/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"
#include "src/loongarch/loongson_util.S"

// Allocate 64 bytes below sp and save f24-f31 there, 8 bytes apiece
// (f24 at sp+0 up to f31 at sp+56). POP_REG is the matching restore.
.macro PUSH_REG
    addi.d           sp,     sp,    -64
    fst.d            f31,    sp,     56
    fst.d            f30,    sp,     48
    fst.d            f29,    sp,     40
    fst.d            f28,    sp,     32
    fst.d            f27,    sp,     24
    fst.d            f26,    sp,     16
    fst.d            f25,    sp,     8
    fst.d            f24,    sp,     0
.endm

// Reload f24-f31 from the 64-byte area written by PUSH_REG (f24 from sp+0
// up to f31 from sp+56) and then release that area.
.macro POP_REG
    fld.d            f31,    sp,     56
    fld.d            f30,    sp,     48
    fld.d            f29,    sp,     40
    fld.d            f28,    sp,     32
    fld.d            f27,    sp,     24
    fld.d            f26,    sp,     16
    fld.d            f25,    sp,     8
    fld.d            f24,    sp,     0
    addi.d           sp,     sp,     64
.endm

// Lower sp by `number` + 64 bytes of scratch space, then spill f24-f31
// below that with PUSH_REG (another 64 bytes). Clobbers t0.
// free_space with the same `number` undoes this.
.macro malloc_space number
    addi.d        sp,       sp,       -64
    li.w          t0,       \number
    sub.d         sp,       sp,       t0
    PUSH_REG
.endm

// Inverse of malloc_space: restore f24-f31 with POP_REG, then raise sp by
// `number` + 64 bytes. Clobbers t0.
.macro free_space number
    POP_REG
    addi.d        sp,       sp,       64
    li.w          t0,       \number
    add.d         sp,       sp,       t0
.endm

// One in-place pass of the 4-point inverse Walsh-Hadamard butterfly on
// 16-bit lanes. Inputs and outputs are vr0..vr3; vr4 and vr5 are scratch.
// Each instruction depends on the previous ones, so the order is significant.
.macro iwht4
    vadd.h        vr0,       vr0,     vr1   // a = in0 + in1
    vsub.h        vr4,       vr2,     vr3   // b = in2 - in3
    vsub.h        vr5,       vr0,     vr4   // a - b
    vsrai.h       vr5,       vr5,     1     // e = (a - b) >> 1 (arithmetic)
    vsub.h        vr2,       vr5,     vr1   // vr2 = e - in1
    vsub.h        vr1,       vr5,     vr3   // vr1 = e - in3
    vadd.h        vr3,       vr4,     vr2   // vr3 = b + vr2
    vsub.h        vr0,       vr0,     vr1   // vr0 = a - vr1
.endm

// Add a 4x4 block of 16-bit residuals to four rows of 8-bit pixels and store.
//   in0..in3 : destination rows (only the low 32 bits of each are used);
//              in0 and in2 are overwritten
//   in4, in5 : residuals for rows 0-1 and rows 2-3 (8 halfwords each)
// The sums are narrowed to unsigned bytes with saturation and written as four
// 32-bit words starting at a0. vstelmx.w is defined outside this chunk; it
// presumably steps a0 by a1 before each store - confirm in loongson_asm.S.
.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5
    vilvl.w       \in0,     \in1,     \in0  // 0 1  2  3  4  5  6  7 x ...
    vilvl.w       \in2,     \in3,     \in2  // 8 9 10 11 12 13 14 15 x ...
    vsllwil.hu.bu \in0,     \in0,     0
    vsllwil.hu.bu \in2,     \in2,     0
    vadd.h        \in0,     \in4,     \in0
    vadd.h        \in2,     \in5,     \in2
    vssrani.bu.h  \in2,     \in0,     0
    vstelm.w      \in2,     a0,       0,    0
    vstelmx.w     \in2,     a0,       a1,   1
    vstelmx.w     \in2,     a0,       a1,   2
    vstelmx.w     \in2,     a0,       a1,   3
.endm

// Fetch four destination rows (a0, a0+a1, t2, t2+a1) into vr0-vr3 and hand
// them to DST_ADD_W4 together with the two residual vectors in0/in1.
// Callers in this file set t2 = a0 + 2*a1 beforehand.
.macro VLD_DST_ADD_W4 in0, in1
    vld           vr2,      t2,       0
    vldx          vr3,      t2,       a1
    vld           vr0,      a0,       0
    vldx          vr1,      a0,       a1

    DST_ADD_W4    vr0, vr1, vr2, vr3, \in0, \in1
.endm

// 4x4 inverse WHT in both directions, added to the destination.
//   a0 = destination pointer, a1 = destination stride,
//   a2 = coefficient buffer (16 halfwords, cleared to zero here).
function inv_txfm_add_wht_wht_4x4_8bpc_lsx
    vld           vr0,       a2,      0
    vld           vr2,       a2,      16

    vxor.v        vr20,      vr20,    vr20
    vsrai.h       vr0,       vr0,     2
    vsrai.h       vr2,       vr2,     2
    vst           vr20,      a2,      0
    // Split each 8-halfword vector into two 4-halfword inputs for iwht4.
    vpickod.d     vr1,       vr0,     vr0
    vpickod.d     vr3,       vr2,     vr2
    vst           vr20,      a2,      16

    iwht4

    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5

    iwht4

    // Pack rows 0-1 into vr4 and rows 2-3 into vr5, then add to the pixels.
    vilvl.d       vr4,       vr1,     vr0
    vilvl.d       vr5,       vr3,     vr2
    alsl.d        t2,        a1,      a0,    1
    VLD_DST_ADD_W4 vr4, vr5
endfunc

// 32-bit copy of the inverse-DCT multiplier table (32 entries). The 16-bit
// twin, idct_coeffs_h, holds the same values in the same order.
const idct_coeffs, align=4
    .word          2896, 2896*8, 1567, 3784, 799, 4017, 3406, 2276
    .word          401, 4076, 3166, 2598, 1931, 3612, 3920, 1189
    .word          201, 4091, 3035, 2751, 1751, 3703, 3857, 1380
    .word          995, 3973, 3513, 2106, 2440, 3290, 4052, 601
endconst

// Rounding arithmetic right shift of four halfword vectors by `shift`:
// outN = vsrari.h(inN, shift) for N = 0..3, issued in that order.
.macro vsrari_h_x4 in0, in1, in2, in3, out0, out1, out2, out3, shift
    vsrari.h      \out0,    \in0,     \shift
    vsrari.h      \out1,    \in1,     \shift
    vsrari.h      \out2,    \in2,     \shift
    vsrari.h      \out3,    \in3,     \shift
.endm

// Eight-vector form of vsrari_h_x4: outN = vsrari.h(inN, shift), N = 0..7,
// issued in that order.
.macro vsrari_h_x8 in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                   out1, out2, out3, out4, out5, out6, out7, shift
    vsrari.h      \out0,    \in0,     \shift
    vsrari.h      \out1,    \in1,     \shift
    vsrari.h      \out2,    \in2,     \shift
    vsrari.h      \out3,    \in3,     \shift
    vsrari.h      \out4,    \in4,     \shift
    vsrari.h      \out5,    \in5,     \shift
    vsrari.h      \out6,    \in6,     \shift
    vsrari.h      \out7,    \in7,     \shift
.endm

// Widening multiply-accumulate: per 16-bit lane, in0*in2 + in1*in3 as 32-bit.
// Even lanes are accumulated in out0 and odd lanes in out1, then the two are
// interleaved back into lane order:
//   sz == .4h : out0 = products 0..3 (out1 is left holding the odd products)
//   otherwise : out0 = products 0..3, out1 = products 4..7; vr22 is clobbered
// out0/out1 are written before in1/in3 are read, so they must not alias the
// inputs.
.macro vmulev_vmaddod_lsx in0, in1, in2, in3, out0, out1, sz
    vmulwev.w.h   \out0,    \in0,     \in2
    vmulwod.w.h   \out1,    \in0,     \in2
    vmaddwev.w.h  \out0,    \in1,     \in3
    vmaddwod.w.h  \out1,    \in1,     \in3
.ifc \sz, .4h
    vilvl.w       \out0,    \out1,    \out0
.else
    vilvl.w       vr22,     \out1,    \out0
    vilvh.w       \out1,    \out1,    \out0
    vor.v         \out0,    vr22,     vr22
.endif
.endm

// 16-bit inverse-DCT multiplier table, read with vldrepl.h at byte offsets.
// Offsets used in this part of the file: 0 (2896), 4 (1567), 6 (3784),
// 8 (799), 10 (4017), 12 (3406), 14 (2276).
const idct_coeffs_h, align=4
    .short          2896, 2896*8, 1567, 3784, 799, 4017, 3406, 2276
    .short          401, 4076, 3166, 2598, 1931, 3612, 3920, 1189
    .short          201, 4091, 3035, 2751, 1751, 3703, 3857, 1380
    .short          995, 3973, 3513, 2106, 2440, 3290, 4052, 601
endconst

// 32-bit multipliers for the 4-point inverse ADST, loaded with vldrepl.w
// from byte offsets 0, 4, 8 and 12.
const iadst4_coeffs, align=4
    .word          1321, 3803
    .word          2482, 3344
endconst

// 4-point inverse DCT on halfword vectors.
//   in0..in3   : inputs (in0 and in2 are also reused as scratch)
//   out0..out3 : outputs, may be the same registers as the inputs
//   sz         : .4h or .8h, forwarded to vmulev_vmaddod_lsx
// Clobbers t0, vr16-vr21, plus vr22 when sz is not .4h.
// Products are narrowed with a rounding, saturating shift by 12.
.macro inv_dct4_lsx in0, in1, in2, in3, out0, out1, out2, out3, sz
    la.local      t0,       idct_coeffs_h

    vldrepl.h     vr20,     t0,       0    // 2896
    vmulev_vmaddod_lsx \in0, \in2, vr20, vr20, vr16, vr18, \sz
    vneg.h        vr21,     vr20
    vmulev_vmaddod_lsx \in0, \in2, vr20, vr21, vr17, vr19, \sz
    vssrarni.h.w  vr18,     vr16,     12   // t0
    vssrarni.h.w  vr19,     vr17,     12   // t1

    vldrepl.h     vr20,     t0,       4    // 1567
    vldrepl.h     vr21,     t0,       6    // 3784
    vmulev_vmaddod_lsx \in1, \in3, vr21, vr20, \in0, vr16, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx \in1, \in3, vr20, vr21, \in2, vr17, \sz
    vssrarni.h.w  vr16,     \in0,     12   // t3
    vssrarni.h.w  vr17,     \in2,     12   // t2

    vsadd.h       \out0,    vr18,     vr16 // t0 + t3
    vsadd.h       \out1,    vr19,     vr17 // t1 + t2
    vssub.h       \out2,    vr19,     vr17 // t1 - t2
    vssub.h       \out3,    vr18,     vr16 // t0 - t3
.endm

// In-place 4-point inverse DCT on vr0..vr3, four halfword lanes per vector.
// Called indirectly through t7/t8 by the inv_txfm_add_* drivers.
functionl inv_dct_4h_x4_lsx
    inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .4h
endfuncl

// In-place 4-point inverse DCT on vr0..vr3, eight halfword lanes per vector.
functionl inv_dct_8h_x4_lsx
    inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .8h
endfuncl

// Core of the 4-point inverse ADST on 32-bit lanes (no final rounding shift).
// Expects vr20 = 1321, vr21 = 3803, vr22 = 2482, vr23 = 3344 replicated per
// word, as set up by the callers in this file. Scratch: vr15-vr19.
// Outputs may overlap the inputs in the patterns used in this file (same or
// reversed register order); other aliasing has not been checked.
.macro inv_adst4_core_lsx in0, in1, in2, in3, out0, out1, out2, out3
    vsub.w        vr16,     \in0,    \in2  // in0-in2
    vmul.w        vr17,     \in0,    vr20  // in0*1321
    vmul.w        vr19,     \in0,    vr22  // in0*2482
    vmul.w        vr18,     \in1,    vr23  // in1*3344
    vmadd.w       vr17,     \in2,    vr21  // in0*1321+in2*3803
    vmsub.w       vr19,     \in2,    vr20  // in0*2482-in2*1321
    vadd.w        vr16,     vr16,    \in3  // in0-in2+in3
    vmadd.w       vr17,     \in3,    vr22  // in0*1321+in2*3803+in3*2482
    vmsub.w       vr19,     \in3,    vr21  // in0*2482-in2*1321-in3*3803
    vadd.w        vr15,     vr17,    vr19  // sum of the two partial terms
    vmul.w        \out2,    vr16,    vr23  // out[2] 8  9  10 11
    vadd.w        \out0,    vr17,    vr18  // out[0] 0  1  2  3
    vadd.w        \out1,    vr19,    vr18  // out[1] 4  5  6  7
    vsub.w        \out3,    vr15,    vr18  // out[3] 12 13 14 15
.endm

// 4-point inverse ADST for four halfword lanes per vector.
// The low four halfwords of in0..in3 are sign-extended into vr0..vr3, run
// through inv_adst4_core_lsx, and narrowed back to halfwords with a rounding,
// saturating shift by 12. Clobbers t0, vr0-vr3, vr15-vr23.
.macro inv_adst4_lsx in0, in1, in2, in3, out0, out1, out2, out3
    la.local      t0,       iadst4_coeffs

    vldrepl.w     vr20,     t0,      0     // 1321
    vldrepl.w     vr21,     t0,      4     // 3803
    vldrepl.w     vr22,     t0,      8     // 2482
    vldrepl.w     vr23,     t0,      12    // 3344

    vsllwil.w.h   vr0,      \in0,    0
    vsllwil.w.h   vr1,      \in1,    0
    vsllwil.w.h   vr2,      \in2,    0
    vsllwil.w.h   vr3,      \in3,    0
    inv_adst4_core_lsx vr0, vr1, vr2, vr3, \out0, \out1, \out2, \out3
    vssrarni.h.w  \out0,    \out0,   12
    vssrarni.h.w  \out1,    \out1,   12
    vssrarni.h.w  \out2,    \out2,   12
    vssrarni.h.w  \out3,    \out3,   12
.endm

// In-place 4-point inverse ADST on vr0..vr3 (four halfword lanes each).
functionl inv_adst_4h_x4_lsx
    inv_adst4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3
endfuncl

// Same as inv_adst_4h_x4_lsx but with the outputs written in reverse
// register order (out0 -> vr3 ... out3 -> vr0).
functionl inv_flipadst_4h_x4_lsx
    inv_adst4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0
endfuncl

// 4-point inverse ADST for eight halfword lanes per vector.
// The low and high four lanes are widened and transformed separately (the low
// results are parked in vr10-vr13), then both halves are narrowed back
// together with a rounding, saturating shift by 12.
// Clobbers t0, in0..in3, vr10-vr13, vr15-vr23.
.macro inv_adst_8x4_lsx in0, in1, in2, in3, out0, out1, out2, out3
    la.local      t0,       iadst4_coeffs
    vldrepl.w     vr20,     t0,      0     // 1321
    vldrepl.w     vr21,     t0,      4     // 3803
    vldrepl.w     vr22,     t0,      8     // 2482
    vldrepl.w     vr23,     t0,      12    // 3344

    vsllwil.w.h   vr10,     \in0,     0     // in0
    vsllwil.w.h   vr11,     \in1,     0     // in1
    vsllwil.w.h   vr12,     \in2,     0     // in2
    vsllwil.w.h   vr13,     \in3,     0     // in3
    inv_adst4_core_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vexth.w.h     \in0,      \in0           // in0
    vexth.w.h     \in1,      \in1           // in1
    vexth.w.h     \in2,      \in2           // in2
    vexth.w.h     \in3,      \in3           // in3
    inv_adst4_core_lsx \in0, \in1, \in2, \in3, \out0, \out1, \out2, \out3

    vssrarni.h.w  \out0,     vr10,    12
    vssrarni.h.w  \out1,     vr11,    12
    vssrarni.h.w  \out2,     vr12,    12
    vssrarni.h.w  \out3,     vr13,    12
.endm

// In-place 4-point inverse ADST on vr0..vr3 (eight halfword lanes each).
functionl inv_adst_8h_x4_lsx
    inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3
endfuncl

// Same as inv_adst_8h_x4_lsx but with the outputs written in reverse
// register order (out0 -> vr3 ... out3 -> vr0).
functionl inv_flipadst_8h_x4_lsx
    inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0
endfuncl

// 4-point identity transform on vr0..vr3 (four halfword lanes each):
// x = sat(x + ((x * 1697 + 2048) >> 12)).
// Rows are paired into two full vectors so each multiply covers eight lanes,
// then split back into four registers. Clobbers t0, vr16-vr20, vr22, vr23.
functionl inv_identity_4h_x4_lsx
    li.w          t0,       1697
    vreplgr2vr.h  vr20,     t0

    vilvl.d       vr0,      vr1,      vr0  // rows 0|1
    vilvl.d       vr2,      vr3,      vr2  // rows 2|3
    vmulwev.w.h   vr16,     vr0,      vr20
    vmulwod.w.h   vr17,     vr0,      vr20
    vmulwev.w.h   vr18,     vr2,      vr20
    vmulwod.w.h   vr19,     vr2,      vr20
    vilvl.w       vr1,      vr17,     vr16
    vilvh.w       vr3,      vr17,     vr16
    vilvl.w       vr22,     vr19,     vr18
    vilvh.w       vr23,     vr19,     vr18
    vssrarni.h.w  vr3,      vr1,      12
    vssrarni.h.w  vr23,     vr22,     12
    vsadd.h       vr0,      vr3,      vr0  // t0
    vsadd.h       vr2,      vr23,     vr2  // t2
    vilvh.d       vr1,      vr0,      vr0  // t1
    vilvh.d       vr3,      vr2,      vr2  // t3
endfuncl

// Identity-4 scaling fused with a final rounding shift by 1, done in 32 bits:
// y = x + ((x * in2 + 2048) >> 12), then narrowed to halfwords with a
// rounding, saturating shift by 1.
//   in0  : source of the low four halfwords
//   in1  : source of the high four halfwords
//   in2  : multiplier replicated per 32-bit word
//   out0 : scratch for the low half
//   out1 : receives all eight results
// Clobbers vr16-vr19 and out0.
.macro inv_identity4_lsx1 in0, in1, in2, out0, out1
    vsllwil.w.h   vr16,     \in0,     0
    vexth.w.h     vr17,     \in1
    vmul.w        vr18,     vr16,     \in2
    vmul.w        vr19,     vr17,     \in2
    vsrari.w      vr18,     vr18,     12
    vsrari.w      vr19,     vr19,     12
    vadd.w        \out0,    vr18,     vr16
    vadd.w        \out1,    vr19,     vr17
    vssrarni.h.w  \out1,    \out0,    1
.endm

// 4-point identity transform on vr0..vr3 (eight halfword lanes each):
// x = sat(x + ((x * 1697 + 2048) >> 12)).
// vr0/vr1 are handled first, then vr2/vr3 with the same instruction sequence.
// Clobbers t0, vr16-vr23.
functionl inv_identity_8h_x4_lsx
    li.w          t0,        1697
    vreplgr2vr.h  vr20,      t0
    vmulwev.w.h   vr16,      vr0,     vr20
    vmulwod.w.h   vr17,      vr0,     vr20
    vmulwev.w.h   vr18,      vr1,     vr20
    vmulwod.w.h   vr19,      vr1,     vr20
    vilvl.w       vr21,      vr17,    vr16
    vilvh.w       vr22,      vr17,    vr16
    vilvl.w       vr23,      vr19,    vr18
    vilvh.w       vr16,      vr19,    vr18
    vssrarni.h.w  vr22,      vr21,    12
    vssrarni.h.w  vr16,      vr23,    12
    vsadd.h       vr0,       vr22,    vr0  // t0
    vsadd.h       vr1,       vr16,    vr1  // t1
    vmulwev.w.h   vr16,      vr2,     vr20
    vmulwod.w.h   vr17,      vr2,     vr20
    vmulwev.w.h   vr18,      vr3,     vr20
    vmulwod.w.h   vr19,      vr3,     vr20
    vilvl.w       vr21,      vr17,    vr16
    vilvh.w       vr22,      vr17,    vr16
    vilvl.w       vr23,      vr19,    vr18
    vilvh.w       vr16,      vr19,    vr18
    vssrarni.h.w  vr22,      vr21,    12
    vssrarni.h.w  vr16,      vr23,    12
    vsadd.h       vr2,       vr22,    vr2  // t2
    vsadd.h       vr3,       vr16,    vr3  // t3
endfuncl

// 4-point identity transform on vr0..vr3 (eight halfword lanes each) with the
// following rounding shift by 1 folded in; see inv_identity4_lsx1.
// Clobbers t0, vr16-vr21.
functionl inv_identity_8h_x4_lsx1
    li.w          t0,        1697
    vreplgr2vr.w  vr20,      t0            // multiplier replicated per word
.irp i, vr0, vr1, vr2, vr3
    // Arguments: in0, in1, in2 (multiplier), out0 (scratch), out1 (result).
    // Every argument is comma-separated so the call does not depend on the
    // assembler also accepting whitespace as a separator.
    inv_identity4_lsx1 \i, \i, vr20, vr21, \i
.endr
endfuncl

// Shared 4x4 driver: first 1-D pass, transpose, second 1-D pass, add to dst.
//   a0 = destination, a1 = stride, a2 = coefficients (zeroed here),
//   t7 = first-pass routine, t8 = second-pass routine.
// The return address is parked in t6 around each indirect call.
functionl inv_txfm_add_4x4_lsx
    vld           vr0,      a2,       0
    vld           vr2,      a2,       16
    vxor.v        vr23,     vr23,     vr23
.irp i, 0, 16
    vst           vr23,     a2,       \i
.endr
    vilvh.d       vr1,      vr0,      vr0
    vilvh.d       vr3,      vr2,      vr2

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    alsl.d        t2,       a1,       a0,    1
    // Rows 0-1, final rounding shift by 4.
    vilvl.d       vr4,      vr1,      vr0
    vsrari.h      vr4,      vr4,      4
    // Rows 2-3, final rounding shift by 4.
    vilvl.d       vr5,      vr3,      vr2
    vsrari.h      vr5,      vr5,      4
    VLD_DST_ADD_W4 vr4, vr5
endfuncl

// DC-only shortcut shared by the dct_dct entry points.
// Reads the DC coefficient from a2[0], clears it, scales it, and leaves the
// final value replicated in every halfword of vr20. The first four
// destination rows are loaded into vr10..vr13 along the way.
//   w, h  : block size; one extra 181/256 scaling is applied when w = 2h or
//           h = 2w
//   shift : intermediate rounding shift (skipped when 0)
// Clobbers t2 (left as a0 + 2*a1), vr0, vr1, vr2.
.macro idct_dc w, h, shift
    ld.h          t2,       a2,       0      // dc
    vldi          vr0,      0x8b5            // 181
    vreplgr2vr.w  vr1,      t2
    vldi          vr20,     0x880            // 128
    vmul.w        vr2,      vr0,      vr1    // dc * 181
    st.h          zero,     a2,       0
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    vld           vr10,     a0,       0      // 0 1 2 3 4 5 6 7

.if (2*\w == \h) || (2*\h == \w)
    vmul.w        vr2,      vr0,      vr2
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
.endif
.if \shift>0
    vsrari.w      vr2,      vr2,      \shift      // (dc + rnd) >> shift
.endif
    vldx          vr11,     a0,       a1     // 8 9 10 11 12 13 14 15
    alsl.d        t2,       a1,       a0,    1
    vmadd.w       vr20,     vr2,      vr0    // 128 + dc * 181
    vld           vr12,     t2,       0      // 16 17 18 19 20 21 22 23
    vssrarni.h.w  vr20,     vr20,     12     // rounding narrow, same value in both halves
    vldx          vr13,     t2,       a1     // 24 25 26 27 28 29 30 31
.endm

// Emit the public 4x4 entry point for one (txfm1, txfm2) pair.
// For dct_dct, a3 == 0 takes the DC-only path (idct_dc + DST_ADD_W4); a3 is
// presumably the end-of-block index - confirm against the C caller.
// Otherwise t7/t8 are pointed at the first/second 1-D routines and control
// branches into inv_txfm_add_4x4_lsx.
.macro fun4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       1f

    idct_dc 4, 4, 0

    DST_ADD_W4    vr10, vr11, vr12, vr13, vr20, vr20
    b             .\txfm1\()_\txfm2\()_4X4_END
1:
.endif

    la.local     t7,    inv_\txfm1\()_4h_x4_lsx
    la.local     t8,    inv_\txfm2\()_4h_x4_lsx

    b            inv_txfm_add_4x4_lsx
.\txfm1\()_\txfm2\()_4X4_END:
endfunc
.endm

// Instantiate all sixteen 4x4 transform combinations.
fun4x4 dct, dct
fun4x4 identity, identity
fun4x4 adst, dct
fun4x4 dct, adst
fun4x4 adst, adst
fun4x4 dct, flipadst
fun4x4 flipadst, adst
fun4x4 adst, flipadst
fun4x4 flipadst, dct
fun4x4 flipadst, flipadst
fun4x4 dct, identity
fun4x4 identity, dct
fun4x4 flipadst, identity
fun4x4 identity, flipadst
fun4x4 identity, adst
fun4x4 adst, identity

// 16-bit multipliers for the first stage of the 8-point inverse ADST, read
// with vldrepl.h at byte offsets 0..14. The second row is not referenced by
// the code visible in this part of the file.
const iadst8_coeffs_h, align=4
    .short          4076, 401, 3612, 1931, 2598, 3166, 1189, 3920
    .short          2896, 0, 1567, 3784, 0, 0, 0, 0
endconst

// 8-point inverse ADST on halfword vectors.
//   inputs     : vr0..vr7 (fixed registers, consumed as scratch)
//   out0..out7 : destination registers; callers in this file pass vr0..vr7
//                either in order (adst) or reversed (flipadst)
//   sz         : forwarded to vmulev_vmaddod_lsx
// Clobbers t0 and vr16-vr23. Odd-numbered outputs are negated; the negation is
// done on 32-bit values and then narrowed with saturation.
// Registers are recycled aggressively, so the instruction order is significant.
.macro inv_adst8_lsx out0, out1, out2, out3, out4, out5, out6, out7, sz
    la.local      t0,       iadst8_coeffs_h

    vldrepl.h     vr20,     t0,       0     // 4076
    vldrepl.h     vr21,     t0,       2     // 401
    vmulev_vmaddod_lsx vr7, vr0, vr20, vr21, vr16, vr17, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr7, vr0, vr21, vr20, vr18, vr19, \sz
    vssrarni.h.w  vr17,     vr16,     12    // t0a
    vssrarni.h.w  vr19,     vr18,     12    // t1a

    vldrepl.h     vr20,     t0,       4     // 3612
    vldrepl.h     vr21,     t0,       6     // 1931
    vmulev_vmaddod_lsx vr5, vr2, vr20, vr21, vr0, vr16, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr5, vr2, vr21, vr20, vr7, vr18, \sz
    vssrarni.h.w  vr16,     vr0,      12    // t2a
    vssrarni.h.w  vr18,     vr7,      12    // t3a

    vldrepl.h     vr20,     t0,       8     // 2598
    vldrepl.h     vr21,     t0,       10    // 3166
    vmulev_vmaddod_lsx vr3, vr4, vr20, vr21, vr2, vr0, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr3, vr4, vr21, vr20, vr5, vr7, \sz
    vssrarni.h.w  vr0,      vr2,      12    // t4a
    vssrarni.h.w  vr7,      vr5,      12    // t5a

    vldrepl.h     vr20,     t0,       12    // 1189
    vldrepl.h     vr21,     t0,       14    // 3920
    vmulev_vmaddod_lsx vr1, vr6, vr20, vr21, vr3, vr2, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr1, vr6, vr21, vr20, vr4, vr5, \sz
    vssrarni.h.w  vr2,      vr3,      12    // t6a
    vssrarni.h.w  vr5,      vr4,      12    // t7a

    vsadd.h       vr3,      vr17,     vr0   // t0
    vssub.h       vr4,      vr17,     vr0   // t4
    vsadd.h       vr1,      vr19,     vr7   // t1
    vssub.h       vr6,      vr19,     vr7   // t5
    vsadd.h       vr17,     vr16,     vr2   // t2
    vssub.h       vr19,     vr16,     vr2   // t6
    vsadd.h       vr0,      vr18,     vr5   // t3
    vssub.h       vr7,      vr18,     vr5   // t7

    la.local      t0,       idct_coeffs_h

    vldrepl.h     vr20,     t0,       4     // 1567
    vldrepl.h     vr21,     t0,       6     // 3784
    vmulev_vmaddod_lsx vr4, vr6, vr21, vr20, vr16, vr5, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr4, vr6, vr20, vr21, vr18, vr2, \sz
    vssrarni.h.w  vr5,      vr16,     12    // t4a
    vssrarni.h.w  vr2,      vr18,     12    // t5a

    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr7, vr19, vr20, vr21, vr4, vr16, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr7, vr19, vr21, vr20, vr6, vr18, \sz
    vssrarni.h.w  vr16,     vr4,      12    // t7a
    vssrarni.h.w  vr18,     vr6,      12    // t6a

    vsadd.h       vr4,      vr5,      vr18  // out1
    vssub.h       vr19,     vr5,      vr18  // t6
    vsadd.h       vr20,     vr1,      vr0   // out7
    vssub.h       vr18,     vr1,      vr0   // t3
    vsadd.h       \out0,    vr3,      vr17  // out0
    vssub.h       vr5,      vr3,      vr17  // t2
    vsadd.h       \out6,    vr2,      vr16  // out6
    vssub.h       vr23,     vr2,      vr16  // t7

    // Negate out7 and out1 in 32 bits, then narrow back with saturation.
    vsllwil.w.h   vr3,      vr20,     0     // out7
    vexth.w.h     \out7,    vr20            // out7
    vsllwil.w.h   vr21,     vr4,      0     // out1
    vexth.w.h     \out1,    vr4             // out1
    vneg.w        vr3,      vr3
    vneg.w        \out7,    \out7
    vneg.w        vr21,     vr21
    vneg.w        \out1,    \out1
    vssrarni.h.w  \out7,    vr3,      0
    vssrarni.h.w  \out1,    vr21,     0

    la.local      t0,       idct_coeffs_h

    vldrepl.h     vr20,     t0,       0     // 2896
    vmulev_vmaddod_lsx vr5, vr18, vr20, vr20, vr16, \out3, \sz
    vneg.h        vr21,     vr20
    vmulev_vmaddod_lsx vr5, vr18, vr20, vr21, vr17, \out4, \sz
    vsrari.w      vr16,     vr16,     12
    vsrari.w      \out3,    \out3,    12
    vneg.w        vr16,     vr16
    vneg.w        \out3,    \out3
    vssrarni.h.w  \out3,    vr16,     0     // out3
    vssrarni.h.w  \out4,    vr17,     12    // out4

    vmulev_vmaddod_lsx vr19, vr23, vr20, vr20, vr16, \out2, \sz
    vmulev_vmaddod_lsx vr19, vr23, vr20, vr21, vr17, \out5, \sz
    vssrarni.h.w  \out2,    vr16,     12    // out2
    vsrari.w      vr17,     vr17,     12
    vsrari.w      \out5,    \out5,    12
    vneg.w        vr17,     vr17
    vneg.w        \out5,    \out5
    vssrarni.h.w  \out5,    vr17,     0     // out5
.endm

// 8-point inverse ADST on vr0..vr7 (eight halfword lanes each), outputs in
// natural register order.
functionl inv_adst_8h_x8_lsx
    inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
endfuncl

// 8-point inverse ADST on vr0..vr7 (eight halfword lanes each), outputs in
// reversed register order (out0 -> vr7 ... out7 -> vr0).
functionl inv_flipadst_8h_x8_lsx
    inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h
endfuncl

// 4h-named entry point for the 8-point inverse ADST on vr0..vr7.
// NOTE(review): this expands inv_adst8_lsx with .8h, exactly like the 8h
// entry point, so all eight lanes are processed. Presumably deliberate
// because the macro tail works on separate low/high word halves - confirm.
functionl inv_adst_4h_x8_lsx
    inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
endfuncl

// 4h-named entry point for the flipped 8-point inverse ADST (outputs in
// reversed register order).
// NOTE(review): expands with .8h, same as the 8h entry point - confirm this
// is intended.
functionl inv_flipadst_4h_x8_lsx
    inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h
endfuncl

// In-place 8-point inverse DCT on halfword vectors in0..in7.
// The even inputs go through inv_dct4_lsx; the odd inputs form t4..t7, which
// are then combined with the even half. Results c[0]..c[7] land in in0..in7.
// Clobbers t0, vr16-vr21, vr23, plus vr22 when sz is not .4h.
.macro inv_dct8_lsx in0, in1, in2, in3, in4, in5, in6, in7, sz
    inv_dct4_lsx \in0, \in2, \in4, \in6, \in0, \in2, \in4, \in6, \sz

    la.local      t0,       idct_coeffs_h

    vldrepl.h     vr20,     t0,       8        // 799
    vldrepl.h     vr21,     t0,       10       // 4017
    vmulev_vmaddod_lsx  \in1, \in7, vr21, vr20, vr16, vr17, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx  \in1, \in7, vr20, vr21, vr18, vr19, \sz
    vssrarni.h.w  vr17,     vr16,     12       // t7a
    vssrarni.h.w  vr19,     vr18,     12       // t4a

    vldrepl.h     vr20,     t0,       12       // 3406
    vldrepl.h     vr21,     t0,       14       // 2276
    vmulev_vmaddod_lsx  \in5, \in3, vr21, vr20, \in1, vr16, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx  \in5, \in3, vr20, vr21, \in7, vr18, \sz
    vssrarni.h.w  vr16,     \in1,       12      // t6a
    vssrarni.h.w  vr18,     \in7,       12      // t5a

    vssub.h       \in7,     vr19,      vr18     // t5a
    vsadd.h       vr18,     vr19,      vr18     // t4
    vssub.h       \in5,     vr17,      vr16     // t6a
    vsadd.h       vr16,     vr17,      vr16     // t7

    vldrepl.h     vr20,     t0,        0        // 2896
    vmulev_vmaddod_lsx  \in5, \in7, vr20, vr20, \in1, vr17, \sz
    vneg.h        vr21,     vr20
    vmulev_vmaddod_lsx  \in5, \in7, vr20, vr21, vr23, vr19, \sz
    vssrarni.h.w  vr17,     \in1,      12       // t6
    vssrarni.h.w  vr19,     vr23,      12       // t5

    vssub.h       \in7,      \in0,     vr16     //c[7]
    vsadd.h       \in0,      \in0,     vr16     //c[0]
    vssub.h       \in5,      \in4,     vr19     //c[5]
    vsadd.h       vr23,      \in4,     vr19     //c[2]
    vssub.h       \in4,      \in6,     vr18     //c[4]
    vsadd.h       \in3,      \in6,     vr18     //c[3]
    vssub.h       \in6,      \in2,     vr17     //c[6]
    vsadd.h       \in1,      \in2,     vr17     //c[1]
    vor.v         \in2,      vr23,     vr23     // c[2] was parked in vr23 while in2 was still live
.endm

// In-place 8-point inverse DCT on vr0..vr7, eight halfword lanes per vector.
functionl inv_dct_8h_x8_lsx
    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
endfuncl

// In-place 8-point inverse DCT on vr0..vr7, four halfword lanes per vector.
functionl inv_dct_4h_x8_lsx
    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .4h
endfuncl

// Add four rows of eight 16-bit residuals to four rows of 8-bit pixels.
//   in0..in3 : destination rows (low 8 bytes of each are used)
//   in4..in7 : residual rows
// The widened sums live in vr0..vr3 (clobbered), are narrowed to unsigned
// bytes with saturation, and are stored as four 64-bit elements starting at
// a0. vstelmx.d is defined outside this chunk; it presumably steps a0 by a1
// before each store - confirm in loongson_asm.S.
.macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7
    vsllwil.hu.bu vr0,      \in0,     0
    vsllwil.hu.bu vr1,      \in1,     0
    vsllwil.hu.bu vr2,      \in2,     0
    vsllwil.hu.bu vr3,      \in3,     0
    vadd.h        vr0,      \in4,     vr0
    vadd.h        vr1,      \in5,     vr1
    vadd.h        vr2,      \in6,     vr2
    vadd.h        vr3,      \in7,     vr3
    vssrani.bu.h  vr1,      vr0,      0
    vssrani.bu.h  vr3,      vr2,      0
    vstelm.d      vr1,      a0,       0,    0
    vstelmx.d     vr1,      a0,       a1,   1
    vstelmx.d     vr3,      a0,       a1,   0
    vstelmx.d     vr3,      a0,       a1,   1
.endm

// Fetch four destination rows (a0, a0+a1, t2, t2+a1) into vr0-vr3 and pass
// them to DST_ADD_W8 with the residual rows in0..in3.
// Callers in this file set t2 = a0 + 2*a1 beforehand.
.macro VLD_DST_ADD_W8 in0, in1, in2, in3
    vld           vr2,      t2,       0
    vldx          vr3,      t2,       a1
    vld           vr0,      a0,       0
    vldx          vr1,      a0,       a1

    DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3
.endm

// 8-point identity transform: every halfword of vr0..vr7 is doubled with
// signed saturation.
functionl inv_identity_8h_x8_lsx
    vsadd.h       vr0,      vr0,      vr0
    vsadd.h       vr1,      vr1,      vr1
    vsadd.h       vr2,      vr2,      vr2
    vsadd.h       vr3,      vr3,      vr3
    vsadd.h       vr4,      vr4,      vr4
    vsadd.h       vr5,      vr5,      vr5
    vsadd.h       vr6,      vr6,      vr6
    vsadd.h       vr7,      vr7,      vr7
endfuncl

// 4h-named twin of the 8-point identity transform: doubles every halfword of
// vr0..vr7 with signed saturation.
functionl inv_identity_4h_x8_lsx
    vsadd.h       vr0,      vr0,      vr0
    vsadd.h       vr1,      vr1,      vr1
    vsadd.h       vr2,      vr2,      vr2
    vsadd.h       vr3,      vr3,      vr3
    vsadd.h       vr4,      vr4,      vr4
    vsadd.h       vr5,      vr5,      vr5
    vsadd.h       vr6,      vr6,      vr6
    vsadd.h       vr7,      vr7,      vr7
endfuncl

// Emit the shared 8x8 driver, inv_txfm_<variant>add_8x8_lsx.
//   a0 = destination, a1 = stride, a2 = coefficients (zeroed here),
//   t7 = first-pass routine (unused by the identity_ variant),
//   t8 = second-pass routine.
// The identity_ expansion only loads and clears the coefficients, then
// branches to .itx_8x8_epilog. That label is emitted by the default
// expansion only, so both variants must be instantiated.
.macro def_fn_8x8_base variant
functionl inv_txfm_\variant\()add_8x8_lsx
    vxor.v  vr23, vr23, vr23
    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

.ifc \variant, identity_
    // The identity shl #1 and downshift srshr #1 cancel out
    b             .itx_8x8_epilog
.else

    // First pass; the return address is parked in t6 around the call.
    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    // Intermediate rounding shift by 1.
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsrari.h      \i,       \i,       1
.endr

.itx_8x8_epilog:
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    // Second pass.
    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    // Final rounding shift by 4 into vr16..vr23.
    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4

    // Rows 0-3, then step a0 forward and do rows 4-7.
    alsl.d        t2,       a1,       a0,     1
    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
    add.d         a0,       a0,       a1
    alsl.d        t2,       a1,       a0,     1
    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
.endif
endfuncl
.endm

// Both variants are required: the identity_ driver branches into the body of
// the default one.
def_fn_8x8_base identity_
def_fn_8x8_base

// Emit the public 8x8 entry point for one (txfm1, txfm2) pair.
// For dct_dct, a3 == 0 takes the DC-only path: idct_dc leaves the DC term in
// vr20, which is added to rows 0-3 and then rows 4-7. a3 is presumably the
// end-of-block index - confirm against the C caller.
// Otherwise t8 (and t7 unless txfm1 is identity) are set and control branches
// to the matching shared driver.
.macro fn8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_8x8

    idct_dc 8, 8, 1

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,     1
    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20

    b             .\txfm1\()_\txfm2\()_8X8_END
.NO_HAS_DCONLY_8x8:
.endif
    la.local      t8,       inv_\txfm2\()_8h_x8_lsx
.ifc \txfm1, identity
    b             inv_txfm_identity_add_8x8_lsx
.else
    la.local      t7,       inv_\txfm1\()_8h_x8_lsx
    b             inv_txfm_add_8x8_lsx
.endif
.\txfm1\()_\txfm2\()_8X8_END:
endfunc
.endm

// Instantiate all sixteen 8x8 transform combinations.
fn8x8 dct, dct
fn8x8 identity, identity
fn8x8 dct, adst
fn8x8 dct, flipadst
fn8x8 dct, identity
fn8x8 adst, dct
fn8x8 adst, adst
fn8x8 adst, flipadst
fn8x8 flipadst, dct
fn8x8 flipadst, adst
fn8x8 flipadst, flipadst
fn8x8 identity, dct
fn8x8 adst, identity
fn8x8 flipadst, identity
fn8x8 identity, adst
fn8x8 identity, flipadst

// Rectangular-block pre-scale: out0 = sat((in0 * in1 + 2048) >> 12) for each
// halfword of in0, with in1 holding the multiplier replicated per 32-bit word
// (2896 at the call sites in this file). Clobbers vr22 and in0.
.macro rect2_lsx in0, in1, out0
    vsllwil.w.h   vr22,     \in0,     0     // low four lanes, widened
    vexth.w.h     \in0,     \in0            // high four lanes, widened
    vmul.w        vr22,     vr22,     \in1
    vmul.w        \out0,    \in0,     \in1
    vssrarni.h.w  \out0,    vr22,     12
.endm

// Transpose eight vectors of four halfwords (low halves of in0..in7) into
// four vectors of eight halfwords (out0..out3). tmp0..tmp5 are scratch.
.macro LSX_TRANSPOSE8x4_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                          out2, out3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
    vilvl.h       \tmp0,    \in1,     \in0
    vilvl.h       \tmp1,    \in3,     \in2
    vilvl.w       \tmp2,    \tmp1,    \tmp0
    vilvh.w       \tmp3,    \tmp1,    \tmp0
    vilvl.h       \tmp0,    \in5,     \in4
    vilvl.h       \tmp1,    \in7,     \in6
    vilvl.w       \tmp4,    \tmp1,    \tmp0
    vilvh.w       \tmp5,    \tmp1,    \tmp0
    vilvl.d       \out0,    \tmp4,    \tmp2
    vilvh.d       \out1,    \tmp4,    \tmp2
    vilvl.d       \out2,    \tmp5,    \tmp3
    vilvh.d       \out3,    \tmp5,    \tmp3
.endm

// Shared 8x4 driver (8 columns, 4 rows).
//   a0 = destination, a1 = stride, a2 = coefficients (zeroed here),
//   t7 = first-pass routine (8-point, four lanes),
//   t8 = second-pass routine (4-point, eight lanes).
// Coefficients are pre-scaled by 2896/4096 (rect2) before the first pass.
functionl inv_txfm_add_8x4_lsx
    vld           vr0,      a2,       0
    vld           vr2,      a2,       16
    vld           vr4,      a2,       32
    vld           vr6,      a2,       48
    vxor.v        vr23,     vr23,     vr23
.irp i, 0, 16, 32, 48
    vst           vr23,     a2,       \i
.endr

    li.w          t0,       2896
    vreplgr2vr.w  vr23,     t0
.irp i, vr0, vr2, vr4, vr6
    rect2_lsx     \i,       vr23,     \i
.endr

    // Each loaded vector carries two 4-element inputs; split off the upper one.
    vilvh.d       vr1,      vr0,      vr0
    vilvh.d       vr3,      vr2,      vr2
    vilvh.d       vr5,      vr4,      vr4
    vilvh.d       vr7,      vr6,      vr6

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \
                       vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    alsl.d        t2,       a1,       a0,     1
    // Final rounding shift by 4 into vr16..vr19.
    vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4

    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
endfuncl

// Transpose four vectors of eight halfwords (in0..in3) into eight vectors
// holding four halfwords each in their low 64 bits (out0..out7).
// The upper 64 bits of every output end up zero: odd outputs via the byte
// shift, even outputs via the explicit insert. tmp0..tmp3 are scratch.
.macro LSX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, out4, \
                          out5, out6, out7, tmp0, tmp1, tmp2, tmp3
    vilvl.h       \tmp0,    \in1,     \in0
    vilvl.h       \tmp1,    \in3,     \in2
    vilvh.h       \tmp2,    \in1,     \in0
    vilvh.h       \tmp3,    \in3,     \in2
    vilvl.w       \out0,    \tmp1,    \tmp0
    vilvh.w       \out2,    \tmp1,    \tmp0
    vilvl.w       \out4,    \tmp3,    \tmp2
    vilvh.w       \out6,    \tmp3,    \tmp2

    vbsrl.v       \out1,    \out0,    8
    vbsrl.v       \out3,    \out2,    8
    vbsrl.v       \out5,    \out4,    8
    vbsrl.v       \out7,    \out6,    8
    vinsgr2vr.d   \out0,    zero,     1
    vinsgr2vr.d   \out2,    zero,     1
    vinsgr2vr.d   \out4,    zero,     1
    vinsgr2vr.d   \out6,    zero,     1
.endm

// Shared 4x8 driver (4 columns, 8 rows).
//   a0 = destination, a1 = stride, a2 = coefficients (zeroed here),
//   t7 = first-pass routine (4-point, eight lanes),
//   t8 = second-pass routine (8-point, four lanes).
// Coefficients are pre-scaled by 2896/4096 (rect2) before the first pass.
functionl inv_txfm_add_4x8_lsx
    vld           vr0,      a2,       0
    vld           vr1,      a2,       16
    vld           vr2,      a2,       32
    vld           vr3,      a2,       48
    vxor.v        vr23,     vr23,     vr23
.irp i, 0, 16, 32, 48
    vst           vr23,     a2,       \i
.endr

    li.w          t0,       2896
    vreplgr2vr.w  vr23,     t0
.irp i, vr0, vr1, vr2, vr3
    rect2_lsx     \i,       vr23,     \i
.endr

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \
                       vr6, vr7, vr16, vr17, vr18, vr19

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    // Pair up the eight 4-wide rows: two rows per vector. Order matters here
    // because each destination register is also a later source.
    vilvl.d       vr0,      vr1,      vr0
    vilvl.d       vr1,      vr3,      vr2
    vilvl.d       vr2,      vr5,      vr4
    vilvl.d       vr3,      vr7,      vr6

    // Final rounding shift by 4 into vr16..vr19.
    vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4

    // Rows 0-3.
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 vr16, vr17
    // Rows 4-7.
    add.d         a0,       a0,       a1
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 vr18, vr19
endfuncl

// Emit the public 8x4 entry point for one (txfm1, txfm2) pair.
// For dct_dct, a3 == 0 takes the DC-only path; a3 is presumably the
// end-of-block index - confirm against the C caller. Otherwise t7/t8 are
// pointed at the first-pass (8-point, 4h) and second-pass (4-point, 8h)
// routines and control branches into inv_txfm_add_8x4_lsx.
//
// Two fixes relative to the previous revision:
//  * The .ifc operand now uses the explicit separator after txfm1, matching
//    fun4x4 and fn8x8. Without it the literal parentheses stay in the string,
//    the comparison with dct_dct never matches, and the DC-only path is not
//    assembled.
//  * The DC-only path adds vr20, the register idct_dc leaves the DC term in.
//    It previously added vr5, which nothing on this path writes.
.macro fn8x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x4_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_8x4

    idct_dc 8, 4, 0

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20

    b             .\txfm1\()_\txfm2\()_8X4_END
.NO_HAS_DCONLY_8x4:
.endif
    la.local      t7,       inv_\txfm1\()_4h_x8_lsx
    la.local      t8,       inv_\txfm2\()_8h_x4_lsx
    b             inv_txfm_add_8x4_lsx
.\txfm1\()_\txfm2\()_8X4_END:
endfunc
.endm

// Instantiate all sixteen 8x4 transform combinations.
fn8x4 dct, dct
fn8x4 identity, identity
fn8x4 dct, adst
fn8x4 dct, flipadst
fn8x4 dct, identity
fn8x4 adst, dct
fn8x4 adst, adst
fn8x4 adst, flipadst
fn8x4 flipadst, dct
fn8x4 flipadst, adst
fn8x4 flipadst, flipadst
fn8x4 identity, dct
fn8x4 adst, identity
fn8x4 flipadst, identity
fn8x4 identity, adst
fn8x4 identity, flipadst

// Emit the public 4x8 entry point for one (txfm1, txfm2) pair.
// For dct_dct, a3 == 0 takes the DC-only path; a3 is presumably the
// end-of-block index - confirm against the C caller. Otherwise t7/t8 are
// pointed at the first-pass (4-point, 8h) and second-pass (8-point, 4h)
// routines and control branches into inv_txfm_add_4x8_lsx.
//
// Two fixes relative to the previous revision:
//  * The .ifc operand now uses the explicit separator after txfm1, matching
//    fun4x4 and fn8x8. Without it the literal parentheses stay in the string,
//    the comparison with dct_dct never matches, and the DC-only path is not
//    assembled.
//  * Rows 4-7 of the DC-only path add vr20, the same DC vector used for rows
//    0-3. They previously added vr5, which nothing on this path writes.
.macro fn4x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x8_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_4x8

    idct_dc 4, 8, 0

    // Rows 0-3: destination rows were preloaded into vr10..vr13 by idct_dc.
    DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20

    // Rows 4-7: step a0 to the next row group and reload.
    add.d         a0,       a0,       a1
    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr20, vr20
    b             .\txfm1\()_\txfm2\()_4X8_END
.NO_HAS_DCONLY_4x8:
.endif
    la.local      t7,       inv_\txfm1\()_8h_x4_lsx
    la.local      t8,       inv_\txfm2\()_4h_x8_lsx
    b             inv_txfm_add_4x8_lsx
.\txfm1\()_\txfm2\()_4X8_END:
endfunc
.endm

// Instantiate the 4x8 entry point for all 16 pairings of
// {dct, adst, flipadst, identity} x {dct, adst, flipadst, identity}.
.irp tx1, dct, adst, flipadst, identity
.irp tx2, dct, adst, flipadst, identity
fn4x8 \tx1, \tx2
.endr
.endr

// inv_identity4_lsx_x2 in0, in1, in2, in3, in4, out0, out1
// For two halfword vectors at once:
//   out0 = sat16(round((in0 * in2) >> 12) + in3)
//   out1 = sat16(round((in1 * in2) >> 12) + in4)
// in2 is a vector of 32-bit multipliers. Clobbers vr4..vr7.
.macro inv_identity4_lsx_x2 in0, in1, in2, in3, in4, out0, out1
    vsllwil.w.h   vr4,      \in0,    0      // low four halfwords -> words
    vexth.w.h     vr5,      \in0            // high four halfwords -> words
    vsllwil.w.h   vr6,      \in1,    0
    vexth.w.h     vr7,      \in1
    vmul.w        vr4,      vr4,     \in2
    vmul.w        vr5,      vr5,     \in2
    vmul.w        vr6,      vr6,     \in2
    vmul.w        vr7,      vr7,     \in2
    vssrarni.h.w  vr5,      vr4,     12     // round, shift and pack both halves into vr5
    vssrarni.h.w  vr7,      vr6,     12
    vsadd.h       \out0,    vr5,     \in3
    vsadd.h       \out1,    vr7,     \in4
.endm

// vmul_vmadd_w in0, in1, in2, in3, out0, out1
// 32-bit multiply-accumulate of two halfword vectors:
//   out0 = lo(in0) * in2 + lo(in1) * in3
//   out1 = hi(in0) * in2 + hi(in1) * in3
// where lo()/hi() sign-extend the low/high four halfwords to words.
// Clobbers vr22 and vr23. out0/out1 are written before in1 is read, so they
// must not alias in1.
.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1
    vsllwil.w.h   vr22,     \in0,     0
    vexth.w.h     vr23,     \in0
    vmul.w        \out0,    vr22,     \in2
    vmul.w        \out1,    vr23,     \in2
    vsllwil.w.h   vr22,     \in1,     0
    vexth.w.h     vr23,     \in1
    vmadd.w       \out0,    vr22,     \in3
    vmadd.w       \out1,    vr23,     \in3
.endm

// vmul_vmsub_w in0, in1, in2, in3, out0, out1
// 32-bit multiply-subtract of two halfword vectors:
//   out0 = lo(in0) * in2 - lo(in1) * in3
//   out1 = hi(in0) * in2 - hi(in1) * in3
// where lo()/hi() sign-extend the low/high four halfwords to words.
// Clobbers vr22 and vr23. out0/out1 are written before in1 is read, so they
// must not alias in1.
.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1
    vsllwil.w.h   vr22,     \in0,     0
    vexth.w.h     vr23,     \in0
    vmul.w        \out0,    vr22,     \in2
    vmul.w        \out1,    vr23,     \in2
    vsllwil.w.h   vr22,     \in1,     0
    vexth.w.h     vr23,     \in1
    vmsub.w       \out0,    vr22,     \in3
    vmsub.w       \out1,    vr23,     \in3
.endm

// inv_dct16_lsx sz
// 16-point inverse DCT on halfword vectors held in vr0..vr15.
// The even-indexed registers go through inv_dct8_lsx; the odd-indexed ones
// (vr1, vr3, ..., vr15) form the t8..t15 half computed here with constants
// read from idct_coeffs_h. On exit vrN holds c[N] for N = 0..15.
// sz is forwarded unchanged to inv_dct8_lsx and vmulev_vmaddod_lsx (both
// defined elsewhere). This macro itself overwrites t0, vr16..vr21 and vr23;
// the helper macros may clobber further registers.
.macro inv_dct16_lsx sz
    inv_dct8_lsx vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14, \sz

    // Odd half, stage 1: four rotations produce t8a..t15a.
    la.local      t0,       idct_coeffs_h
    vldrepl.h     vr20,     t0,       16        // 401
    vldrepl.h     vr21,     t0,       18        // 4076
    vmulev_vmaddod_lsx vr1, vr15, vr21, vr20, vr16, vr17, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr1, vr15, vr20, vr21, vr18, vr19, \sz
    vssrarni.h.w  vr17,     vr16,     12        // t15a
    vssrarni.h.w  vr19,     vr18,     12        // t8a
    vldrepl.h     vr20,     t0,       20        // 3166 -> 1583
    vldrepl.h     vr21,     t0,       22        // 2598 -> 1299
    vmulev_vmaddod_lsx vr9, vr7, vr21, vr20, vr1, vr16, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr9, vr7, vr20, vr21, vr15, vr18, \sz
    vssrarni.h.w  vr16,     vr1,      12        // t14a
    vssrarni.h.w  vr18,     vr15,     12        // t9a
    vldrepl.h     vr20,     t0,       24        // 1931
    vldrepl.h     vr21,     t0,       26        // 3612
    vmulev_vmaddod_lsx vr5, vr11, vr21, vr20, vr7, vr1, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr5, vr11, vr20, vr21, vr9, vr15, \sz
    vssrarni.h.w  vr1,      vr7,      12        // t13a
    vssrarni.h.w  vr15,     vr9,      12        // t10a
    vldrepl.h     vr20,     t0,       28        // 3920
    vldrepl.h     vr21,     t0,       30        // 1189
    vmulev_vmaddod_lsx vr13, vr3, vr21, vr20, vr5, vr7, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr13, vr3, vr20, vr21, vr11, vr9, \sz
    vssrarni.h.w  vr7,      vr5,      12        // t12a
    vssrarni.h.w  vr9,      vr11,     12        // t11a

    // Stage 2: saturating butterflies.
    vsadd.h       vr5,      vr19,     vr18     // t8
    vssub.h       vr11,     vr19,     vr18     // t9
    vssub.h       vr3,      vr9,      vr15     // t10
    vsadd.h       vr13,     vr9,      vr15     // t11
    vsadd.h       vr18,     vr7,      vr1      // t12
    vssub.h       vr19,     vr7,      vr1      // t13
    vssub.h       vr9,      vr17,     vr16     // t14
    vsadd.h       vr15,     vr17,     vr16     // t15

    // Stage 3: 1567/3784 rotations.
    vldrepl.h     vr20,     t0,       4        // 1567
    vldrepl.h     vr21,     t0,       6        // 3784
    vmulev_vmaddod_lsx vr9, vr11, vr21, vr20, vr1, vr16, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr9, vr11, vr20, vr21, vr7, vr17, \sz
    vssrarni.h.w  vr16,     vr1,      12       // t14a
    vssrarni.h.w  vr17,     vr7,      12       // t9a

    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr19, vr3, vr21, vr20, vr9, vr1, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr19, vr3, vr20, vr21, vr11, vr7, \sz
    vneg.w        vr1,      vr1
    vneg.w        vr9,      vr9
    vssrarni.h.w  vr7,      vr11,     12       // t13a
    vssrarni.h.w  vr1,      vr9,      12       // t10a
    vsadd.h       vr9,      vr5,      vr13     // t8a
    vssub.h       vr11,     vr5,      vr13     // t11a
    vssub.h       vr3,      vr15,     vr18     // t12a
    vsadd.h       vr19,     vr15,     vr18     // t15a
    vsadd.h       vr5,      vr17,     vr1      // t9
    vssub.h       vr13,     vr17,     vr1      // t10
    vssub.h       vr15,     vr16,     vr7      // t13
    vsadd.h       vr18,     vr16,     vr7      // t14

    // Stage 4: 2896 rotations.
    vldrepl.h     vr20,     t0,       0        // 2896
    vmulev_vmaddod_lsx vr15, vr13, vr20, vr20, vr1, vr7, \sz
    vneg.h        vr21,     vr20
    vmulev_vmaddod_lsx vr15, vr13, vr20, vr21, vr17, vr16, \sz
    vssrarni.h.w  vr7,      vr1,      12       // t13a
    vssrarni.h.w  vr16,     vr17,     12       // t10a

    vmulev_vmaddod_lsx vr3, vr11, vr20, vr20, vr13, vr23, \sz
    vmulev_vmaddod_lsx vr3, vr11, vr20, vr21, vr15, vr17, \sz
    vssrarni.h.w  vr23,     vr13,     12       // t12
    vssrarni.h.w  vr17,     vr15,     12       // t11

    // Final butterflies against the 8-point results in the even registers.
    // c[14] and c[12] are staged in vr20/vr21 because vr14 and vr12 are still
    // needed as inputs further down.
    vssub.h       vr15,     vr0,     vr19      // c[15]
    vsadd.h       vr0,      vr0,     vr19      // c[0]
    vsadd.h       vr1,      vr2,     vr18      // c[1]
    vssub.h       vr20,     vr2,     vr18      // c[14]
    vsadd.h       vr2,      vr4,     vr7       // c[2]
    vssub.h       vr13,     vr4,     vr7       // c[13]
    vsadd.h       vr3,      vr6,     vr23      // c[3]
    vssub.h       vr21,     vr6,     vr23      // c[12]
    vsadd.h       vr4,      vr8,     vr17      // c[4]
    vssub.h       vr11,     vr8,     vr17      // c[11]
    vsadd.h       vr7,      vr14,    vr9       // c[7]
    vssub.h       vr8,      vr14,    vr9       // c[8]
    vsadd.h       vr6,      vr12,    vr5       // c[6]
    vssub.h       vr9,      vr12,    vr5       // c[9]
    vsadd.h       vr5,      vr10,    vr16      // c[5]
    vssub.h       vr10,     vr10,    vr16      // c[10]
    vor.v         vr14,     vr20,    vr20
    vor.v         vr12,     vr21,    vr21
.endm

// Local routine wrapping inv_dct16_lsx with sz = .8h (vr0..vr15 in, vr0..vr15 out).
// Its address is loaded into t7 or t8 by the fun16x8 / fun8x16 entry points.
functionl inv_dct_8h_x16_lsx
    inv_dct16_lsx .8h
endfuncl

// Local routine wrapping inv_dct16_lsx with sz = .4h (vr0..vr15 in, vr0..vr15 out).
// Its address is loaded into t8 by the fn4x16 entry points.
functionl inv_dct_4h_x16_lsx
    inv_dct16_lsx .4h
endfuncl

// VLD_DST_ADD_W4_x4 in0 .. in7
// Invokes VLD_DST_ADD_W4 four times, passing the residual registers in pairs
// (in0/in1, in2/in3, in4/in5, in6/in7).
// Before every invocation t2 is set to a0 + 2 * a1; between invocations a0 is
// incremented by a1. VLD_DST_ADD_W4 itself is defined elsewhere in the file.
.macro VLD_DST_ADD_W4_x4 in0, in1, in2, in3, in4, in5, in6 ,in7
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W4 \in0, \in1

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 \in2, \in3

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 \in4, \in5

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 \in6, \in7
.endm

// def_fn_4x16_base txfm
// Emits the shared body inv_txfm_<txfm>add_4x16_lsx used by the fn4x16 entry
// points (txfm is either empty or "identity_").
// Register contract, as used below:
//   a0/a1  destination pointer and stride (consumed by VLD_DST_ADD_W4_x4)
//   a2     coefficient buffer; every vector that is read is zeroed afterwards
//   a3     compared against t5 to decide whether the second coefficient half
//          (offsets 16, 48, 80, 112) has to be processed
//   t5     threshold loaded by the caller
//   t7/t8  addresses of the two 1-D transform routines, called via jirl
//   t6     scratch used to preserve ra around each indirect call
// PUSH_REG/POP_REG save and restore f24..f31 because this routine writes
// vr24..vr28.
.macro def_fn_4x16_base txfm
functionl inv_txfm_\txfm\()add_4x16_lsx
    PUSH_REG
    blt           a3,       t5,       416f      // a3 < t5: skip the second coefficient half
    vld           vr0,      a2,       16
    vld           vr1,      a2,       48
    vld           vr2,      a2,       80
    vld           vr3,      a2,       112
    vxor.v        vr23,     vr23,     vr23
.irp i, 16, 48, 80, 112
    vst           vr23,     a2,       \i
.endr

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    // The intermediate rounding shift is skipped for the identity_ variant.
.ifnc \txfm, identity_
    vsrari.h      vr0,      vr0,      1
    vsrari.h      vr1,      vr1,      1
    vsrari.h      vr2,      vr2,      1
    vsrari.h      vr3,      vr3,      1
.endif

    // Transposed result is parked in vr8, vr9, vr24..vr27, vr14, vr28 so that it
    // survives the second t7 call below.
    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr8, vr9, vr24, vr25, vr26, \
                       vr27, vr14, vr28, vr10, vr11, vr12, vr13

416:
    ble           t5,       a3,       416416f   // block above ran: keep its results
    // Block above was skipped: its outputs are all zero.
.irp i, vr8, vr9, vr24, vr25, vr26, vr27, vr14, vr28
    vxor.v        \i,       \i,       \i
.endr

416416:
    vld           vr0,      a2,       0
    vld           vr1,      a2,       32
    vld           vr2,      a2,       64
    vld           vr3,      a2,       96
    vxor.v        vr23,     vr23,     vr23
.irp i, 0, 32, 64, 96
    vst           vr23,     a2,       \i
.endr

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

.ifnc \txfm, identity_
    vsrari.h      vr0,      vr0,      1
    vsrari.h      vr1,      vr1,      1
    vsrari.h      vr2,      vr2,      1
    vsrari.h      vr3,      vr3,      1
.endif

    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \
                       vr6, vr7, vr16, vr17, vr18, vr19

    // Move the parked vectors into place so that vr0..vr15 feed the t8 routine.
    vor.v         vr10,     vr24,     vr24
    vor.v         vr11,     vr25,     vr25
    vor.v         vr12,     vr26,     vr26
    vor.v         vr13,     vr27,     vr27
    vor.v         vr15,     vr28,     vr28

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    // Pack the low 64 bits of consecutive register pairs into one vector each.
    vilvl.d       vr16,     vr1,      vr0
    vilvl.d       vr17,     vr3,      vr2
    vilvl.d       vr18,     vr5,      vr4
    vilvl.d       vr19,     vr7,      vr6
    vilvl.d       vr20,     vr9,      vr8
    vilvl.d       vr21,     vr11,     vr10
    vilvl.d       vr22,     vr13,     vr12
    vilvl.d       vr23,     vr15,     vr14

    // Final rounding shift by 4.
.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vsrari.h     \i,       \i,       4
.endr

    VLD_DST_ADD_W4_x4 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    POP_REG
endfuncl
.endm

// Emit inv_txfm_identity_add_4x16_lsx and inv_txfm_add_4x16_lsx.
def_fn_4x16_base identity_
def_fn_4x16_base

// fn4x16 txfm1, txfm2, eob_half
// Emits the exported entry point inv_txfm_add_<txfm1>_<txfm2>_4x16_8bpc_lsx.
// The general path sets up the registers expected by the shared 4x16 bodies:
//   t5 = eob_half (threshold compared against a3 by the shared body)
//   t7 = first transform routine (inv_<txfm1>_8h_x4_lsx, or
//        inv_identity_8h_x4_lsx1 when txfm1 is identity)
//   t8 = second transform routine (inv_<txfm2>_4h_x16_lsx)
// and then branches to inv_txfm_identity_add_4x16_lsx for identity first
// passes or inv_txfm_add_4x16_lsx otherwise.
.macro fn4x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_4x16_8bpc_lsx
    // NOTE(review): the first argument below is followed by a bare "()" while
    // fun16x8 uses the backslash-"()" separator form; confirm this comparison
    // can ever equal dct_dct, i.e. that the DC-only path is really assembled.
.ifc \txfm1()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_4x16     // a3 != 0: take the general path

    idct_dc 4, 16, 1

    // NOTE(review): fun16x8 and fun8x16 add vr20 after idct_dc, this path adds
    // vr5; confirm which register idct_dc (defined elsewhere) leaves its result in.
    DST_ADD_W4 vr10, vr11, vr12, vr13, vr5, vr5

.rept 3
    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,   1       // t2 = a0 + 2 * a1

    VLD_DST_ADD_W4 vr5, vr5
.endr
    b             .\txfm1\()_\txfm2\()_4X16_END

.NO_HAS_DCONLY_4x16:
.endif
    li.w          t5,       \eob_half
    la.local      t8,       inv_\txfm2\()_4h_x16_lsx

    // Load t7 exactly once per variant; the identity first pass uses the
    // dedicated "lsx1" routine together with the identity shared body.
.ifc \txfm1, identity
    la.local      t7,       inv_identity_8h_x4_lsx1
    b             inv_txfm_identity_add_4x16_lsx
.else
    la.local      t7,       inv_\txfm1\()_8h_x4_lsx
    b             inv_txfm_add_4x16_lsx
.endif
.\txfm1\()_\txfm2\()_4X16_END:
endfunc
.endm

// Instantiate the 4x16 entry point for all 16 transform pairings.
// eob_half is 29 by default, 8 when only the second transform is identity,
// and 32 when only the first transform is identity.
.irp tx1, dct, adst, flipadst
.irp tx2, dct, adst, flipadst
fn4x16 \tx1, \tx2, 29
.endr
fn4x16 \tx1, identity, 8
.endr
fn4x16 identity, identity, 29
.irp tx2, dct, adst, flipadst
fn4x16 identity, \tx2, 32
.endr

// inv_identity16_lsx in0, in1, in2, out0, sz
// Computes out0 = sat16(round((in0 * in1) >> 11) + sat16(2 * in2)).
// in1 is a vector of 32-bit multipliers. in2 is doubled IN PLACE; callers in
// this file pass the same register for in0, in2 and out0, which works because
// in0 is widened before in2 is modified.
// With sz = .8h all eight halfwords are processed (clobbers vr16, vr17);
// otherwise only the low four halfwords are widened and multiplied
// (clobbers vr16).
.macro inv_identity16_lsx in0, in1, in2, out0, sz
.ifc \sz, .8h
    vsllwil.w.h   vr16,     \in0,     0
    vexth.w.h     vr17,     \in0
    vmul.w        vr16,     vr16,     \in1
    vmul.w        vr17,     vr17,     \in1
    vsadd.h       \in2,     \in2,     \in2      // 2 * in2, saturated
    vssrarni.h.w  vr17,     vr16,     11
    vsadd.h       \out0,    vr17,     \in2
.else
    vsllwil.w.h   vr16,     \in0,     0
    vmul.w        vr16,     vr16,     \in1
    vsadd.h       \in2,     \in2,     \in2      // 2 * in2, saturated
    vssrarni.h.w  vr16,     vr16,     11
    vsadd.h       \out0,    vr16,     \in2
.endif
.endm

// inv_identity16_lsx1 in0, in1, in2, out0
// Variant of the 16-point identity scaling that folds in a final rounding
// shift by one. Per lane, in 32-bit precision:
//   out0 = sat16(round((2 * x + round((x * in2) >> 11)) >> 1))
// The low four lanes of x come from in0 and the high four from in1 (callers in
// this file pass the same register for both). in2 is a vector of 32-bit
// multipliers. Clobbers vr16..vr19.
.macro inv_identity16_lsx1 in0, in1, in2, out0
    vsllwil.w.h   vr16,     \in0,     0
    vexth.w.h     vr17,     \in1
    vmul.w        vr18,     vr16,     \in2
    vmul.w        vr19,     vr17,     \in2
    vsrari.w      vr18,     vr18,     11
    vsrari.w      vr19,     vr19,     11
    vslli.w       vr16,     vr16,     1         // 2 * x
    vslli.w       vr17,     vr17,     1
    vadd.w        vr16,     vr18,     vr16
    vadd.w        \out0,    vr19,     vr17
    vssrarni.h.w  \out0,    vr16,     1         // round, shift and pack both halves
.endm

// Applies inv_identity16_lsx (sz = .8h, multiplier 1697) in place to each of
// vr0..vr15. Clobbers t0 and vr20 in addition to the macro scratch registers.
functionl inv_identity_8h_x16_lsx
    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
    vr9, vr10, vr11, vr12, vr13, vr14, vr15
    inv_identity16_lsx \i, vr20, \i, \i, .8h
.endr
endfuncl

// Applies inv_identity16_lsx (sz = .4h, multiplier 1697) in place to each of
// vr0..vr15. Clobbers t0 and vr20 in addition to the macro scratch register.
functionl inv_identity_4h_x16_lsx
    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
    vr9, vr10, vr11, vr12, vr13, vr14, vr15
    inv_identity16_lsx \i, vr20, \i, \i, .4h
.endr
endfuncl

// Applies inv_identity16_lsx1 (multiplier 1697) in place to each of vr0..vr15.
// This is the routine the fun16x8 entry points select for an identity first
// transform. Clobbers t0 and vr20 in addition to the macro scratch registers.
functionl inv_identity_8h_x16_lsx1
    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
    vr9, vr10, vr11, vr12, vr13, vr14, vr15
    inv_identity16_lsx1 \i, \i, vr20, \i
.endr
endfuncl

// Sixteen halfword constants for the first stage of inv_adst16_lsx, which
// broadcasts them pairwise with vldrepl.h from byte offsets 0, 2, ..., 30.
const iadst16_coeffs_h, align=4
    .short         4091, 201, 3973, 995
    .short         3703, 1751, 3290, 2440
    .short         2751, 3035, 2106, 3513
    .short         1380, 3857, 601, 4052
endconst

// inv_adst16_lsx txfm, sz
// 16-point inverse ADST on halfword vectors held in vr0..vr15.
// Constants come from iadst16_coeffs_h (first stage) and idct_coeffs_h (later
// stages). The trailing vor.v block permutes the results into their final
// registers:
//   txfm == adst : vrN = out[N]       for N = 0..15
//   otherwise    : vrN = out[15 - N]  (flipped order)
// sz is forwarded unchanged to vmulev_vmaddod_lsx (defined elsewhere).
// This macro itself overwrites t0 and vr16..vr22; the helper macro may clobber
// further registers.
.macro inv_adst16_lsx txfm, sz
    // Stage 1: eight rotations of input pairs (15,0), (13,2), ..., (1,14).
    la.local      t0,       iadst16_coeffs_h
    vldrepl.h     vr20,     t0,        0        // 4091
    vldrepl.h     vr21,     t0,        2        // 201
    vmulev_vmaddod_lsx vr15, vr0, vr20, vr21, vr16, vr18, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr15, vr0, vr21, vr20, vr17, vr19, \sz
    vssrarni.h.w  vr18,     vr16,      12       // t0
    vssrarni.h.w  vr19,     vr17,      12       // t1
    vldrepl.h     vr20,     t0,        4        // 3973
    vldrepl.h     vr21,     t0,        6        // 995
    vmulev_vmaddod_lsx vr13, vr2, vr20, vr21, vr16, vr0, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr13, vr2, vr21, vr20, vr17, vr15, \sz
    vssrarni.h.w  vr0,      vr16,      12       // t2
    vssrarni.h.w  vr15,     vr17,      12       // t3
    vldrepl.h     vr20,     t0,        8       // 3703
    vldrepl.h     vr21,     t0,        10       // 1751
    vmulev_vmaddod_lsx vr11, vr4, vr20, vr21, vr16, vr2, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr11, vr4, vr21, vr20, vr17, vr13, \sz
    vssrarni.h.w  vr2,      vr16,      12       // t4
    vssrarni.h.w  vr13,     vr17,      12       // t5
    vldrepl.h     vr20,     t0,        12       // 3290 -> 1645
    vldrepl.h     vr21,     t0,        14       // 2440 -> 1220
    vmulev_vmaddod_lsx vr9, vr6, vr20, vr21, vr16, vr4, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr9, vr6, vr21, vr20, vr17, vr11, \sz
    vssrarni.h.w  vr4,      vr16,      12       // t6
    vssrarni.h.w  vr11,     vr17,      12       // t7
    vldrepl.h     vr20,     t0,        16       // 2751
    vldrepl.h     vr21,     t0,        18       // 3035
    vmulev_vmaddod_lsx vr7, vr8, vr20, vr21, vr16, vr6, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr7, vr8, vr21, vr20, vr17, vr9, \sz
    vssrarni.h.w  vr6,      vr16,      12       // t8
    vssrarni.h.w  vr9,      vr17,      12       // t9
    vldrepl.h     vr20,     t0,        20       // 2106
    vldrepl.h     vr21,     t0,        22       // 3513
    vmulev_vmaddod_lsx vr5, vr10, vr20, vr21, vr16, vr7, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr5, vr10, vr21, vr20, vr17, vr8, \sz
    vssrarni.h.w  vr7,      vr16,      12       // t10
    vssrarni.h.w  vr8,      vr17,      12       // t11
    vldrepl.h     vr20,     t0,        24       // 1380
    vldrepl.h     vr21,     t0,        26       // 3857
    vmulev_vmaddod_lsx vr3, vr12, vr20, vr21, vr16, vr5, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr3, vr12, vr21, vr20, vr17, vr10, \sz
    vssrarni.h.w  vr5,      vr16,      12       // t12
    vssrarni.h.w  vr10,     vr17,      12       // t13
    vldrepl.h     vr20,     t0,        28       // 601
    vldrepl.h     vr21,     t0,        30       // 4052
    vmulev_vmaddod_lsx vr1, vr14, vr20, vr21, vr16, vr3, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr1, vr14, vr21, vr20, vr17, vr12, \sz
    vssrarni.h.w  vr3,      vr16,      12       // t14
    vssrarni.h.w  vr12,     vr17,      12       // t15

    // Stage 2: saturating butterflies.
    vsadd.h       vr1,      vr18,      vr6      // t0a
    vssub.h       vr14,     vr18,      vr6      // t8a
    vsadd.h       vr16,     vr19,      vr9      // t1a
    vssub.h       vr17,     vr19,      vr9      // t9a
    vsadd.h       vr6,      vr0,       vr7      // t2a
    vssub.h       vr18,     vr0,       vr7      // t10a
    vsadd.h       vr9,      vr15,      vr8      // t3a
    vssub.h       vr19,     vr15,      vr8      // t11a
    vsadd.h       vr0,      vr2,       vr5      // t4a
    vssub.h       vr7,      vr2,       vr5      // t12a
    vsadd.h       vr8,      vr13,      vr10     // t5a
    vssub.h       vr15,     vr13,      vr10     // t13a
    vsadd.h       vr2,      vr4,       vr3      // t6a
    vssub.h       vr5,      vr4,       vr3      // t14a
    vsadd.h       vr10,     vr11,      vr12     // t7a
    vssub.h       vr13,     vr11,      vr12     // t15a

    // Stage 3: rotations on t8a..t15a with constants from idct_coeffs_h.
    la.local      t0,       idct_coeffs_h

    vldrepl.h     vr20,     t0,        8        // 799
    vldrepl.h     vr21,     t0,        10       // 4017
    vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr3, vr11, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr4, vr12, \sz
    vssrarni.h.w  vr11,     vr3,       12       // t8
    vssrarni.h.w  vr12,     vr4,       12       // t9
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr15, vr7, vr20, vr21, vr3, vr14, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr15, vr7, vr21, vr20, vr4, vr17, \sz
    vssrarni.h.w  vr14,     vr3,       12       // t13
    vssrarni.h.w  vr17,     vr4,       12       // t12
    vldrepl.h     vr20,     t0,        12       // 3406
    vldrepl.h     vr21,     t0,        14       // 2276
    vmulev_vmaddod_lsx vr18, vr19, vr21, vr20, vr3, vr7, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr18, vr19, vr20, vr21, vr4, vr15, \sz
    vssrarni.h.w  vr7,      vr3,       12       // t10
    vssrarni.h.w  vr15,     vr4,       12       // t11
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr13, vr5, vr20, vr21, vr3, vr18, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr13, vr5, vr21, vr20, vr4, vr19, \sz
    vssrarni.h.w  vr18,     vr3,       12       // t15
    vssrarni.h.w  vr19,     vr4,       12       // t14

    // Stage 4: saturating butterflies.
    vsadd.h       vr5,      vr1,       vr0      // t0
    vssub.h       vr13,     vr1,       vr0      // t4
    vsadd.h       vr3,      vr16,      vr8      // t1
    vssub.h       vr4,      vr16,      vr8      // t5
    vsadd.h       vr0,      vr6,       vr2      // t2
    vssub.h       vr1,      vr6,       vr2      // t6
    vsadd.h       vr8,      vr9,       vr10     // t3
    vssub.h       vr16,     vr9,       vr10     // t7
    vsadd.h       vr2,      vr11,      vr17     // t8a
    vssub.h       vr6,      vr11,      vr17     // t12a
    vsadd.h       vr9,      vr12,      vr14     // t9a
    vssub.h       vr10,     vr12,      vr14     // t13a
    vsadd.h       vr11,     vr7,       vr19     // t10a
    vssub.h       vr17,     vr7,       vr19     // t14a
    vsadd.h       vr12,     vr15,      vr18     // t11a
    vssub.h       vr14,     vr15,      vr18     // t15a

    // Stage 5: 1567/3784 rotations.
    vldrepl.h     vr20,     t0,        4        // 1567
    vldrepl.h     vr21,     t0,        6       // 3784
    vmulev_vmaddod_lsx vr13, vr4, vr21, vr20, vr7, vr18, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr13, vr4, vr20, vr21, vr15, vr19, \sz
    vssrarni.h.w  vr18,     vr7,       12       // t4a
    vssrarni.h.w  vr19,     vr15,      12       // t5a
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr16, vr1, vr20, vr21, vr7, vr4, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr16, vr1, vr21, vr20, vr15, vr13, \sz
    vssrarni.h.w  vr4,      vr7,       12       // t7a
    vssrarni.h.w  vr13,     vr15,      12       // t6a
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr6, vr10, vr21, vr20, vr7, vr1, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr6, vr10, vr20, vr21, vr15, vr16, \sz
    vssrarni.h.w  vr1,      vr7,       12       // t12
    vssrarni.h.w  vr16,     vr15,      12       // t13
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr7, vr6, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr15, vr10, \sz
    vssrarni.h.w  vr6,      vr7,       12       // t15
    vssrarni.h.w  vr10,     vr15,      12       // t14

    // Stage 6: butterflies. Outputs that must be negated are widened to words,
    // negated there and re-packed with saturation.
    vssub.h       vr17,     vr5,       vr0      // t2a
    vsadd.h       vr14,     vr5,       vr0      // out[0]
    vssub.h       vr7,      vr3,       vr8      // t3a
    vsadd.h       vr15,     vr3,       vr8      // out[15]
    vsllwil.w.h   vr22,     vr15,      0
    vexth.w.h     vr15,     vr15
    vneg.w        vr22,     vr22
    vneg.w        vr15,     vr15
    vssrarni.h.w  vr15,     vr22,      0        // out[15]

    vsadd.h       vr3,      vr19,      vr4      // out[12]
    vssub.h       vr8,      vr19,      vr4      // t7
    vssub.h       vr0,      vr18,      vr13     // t6
    vsadd.h       vr5,      vr18,      vr13     // out[3]
    vsllwil.w.h   vr22,     vr5,       0
    vexth.w.h     vr5,      vr5
    vneg.w        vr22,     vr22
    vneg.w        vr5,      vr5
    vssrarni.h.w  vr5,      vr22,      0        // out[3]

    vsadd.h       vr13,     vr9,       vr12     // out[14]
    vssub.h       vr19,     vr9,       vr12     // t11
    vssub.h       vr4,      vr2,       vr11     // t10
    vsadd.h       vr18,     vr2,       vr11     // out[1]
    vsllwil.w.h   vr22,     vr18,      0
    vexth.w.h     vr18,     vr18
    vneg.w        vr22,     vr22
    vneg.w        vr18,     vr18
    vssrarni.h.w  vr18,     vr22,      0        // out[1]

    vsadd.h       vr2,      vr1,       vr10     // out[2]
    vssub.h       vr11,     vr1,       vr10     // t14a
    vssub.h       vr12,     vr16,      vr6      // t15a
    vsadd.h       vr9,      vr16,      vr6      // out[13]
    vsllwil.w.h   vr22,     vr9,       0
    vexth.w.h     vr9,      vr9
    vneg.w        vr22,     vr22
    vneg.w        vr9,      vr9
    vssrarni.h.w  vr9,      vr22,      0        // out[13]

    // Stage 7: 2896 rotations produce the remaining eight outputs.
    vldrepl.h     vr20,     t0,        0        // 2896
    vmulev_vmaddod_lsx vr17, vr7, vr20, vr20, vr6, vr10, \sz
    vneg.h        vr21,     vr20
    vmulev_vmaddod_lsx vr17, vr7, vr20, vr21, vr16, vr1, \sz
    vssrarni.h.w  vr1,      vr16,      12       // out[8]
    vsrari.w      vr6,      vr6,       12
    vsrari.w      vr10,     vr10,      12
    vneg.w        vr6,      vr6
    vneg.w        vr10,     vr10
    vssrarni.h.w  vr10,     vr6,       0        // out[7]
    vmulev_vmaddod_lsx vr0, vr8, vr20, vr21, vr16, vr17, \sz
    vmulev_vmaddod_lsx vr0, vr8, vr20, vr20, vr6, vr7, \sz
    vssrarni.h.w  vr7,      vr6,       12       // out[4]
    vsrari.w      vr16,     vr16,      12
    vsrari.w      vr17,     vr17,      12
    vneg.w        vr16,     vr16
    vneg.w        vr17,     vr17
    vssrarni.h.w  vr17,     vr16,       0        // out[11]

    vmulev_vmaddod_lsx vr4, vr19, vr20, vr21, vr16, vr0, \sz
    vmulev_vmaddod_lsx vr4, vr19, vr20, vr20, vr6, vr8, \sz
    vssrarni.h.w  vr8,      vr6,       12       // out[6]
    vsrari.w      vr16,     vr16,      12
    vsrari.w      vr0,      vr0,       12
    vneg.w        vr16,     vr16
    vneg.w        vr0,      vr0
    vssrarni.h.w  vr0,      vr16,      0    // out[9]

    vmulev_vmaddod_lsx vr11, vr12, vr20, vr20, vr6, vr4, \sz
    vmulev_vmaddod_lsx vr11, vr12, vr20, vr21, vr16, vr19, \sz
    vssrarni.h.w  vr19,     vr16,      12       // out[10]
    vsrari.w      vr6,      vr6,       12
    vsrari.w      vr4,      vr4,       12
    vneg.w        vr6,      vr6
    vneg.w        vr4,      vr4
    vssrarni.h.w  vr4,      vr6,       0        // out[5]

    // Move every output into its final register. The order of the copies
    // matters: each source is read before it is overwritten.
.ifc \txfm, adst
    vor.v         vr12,     vr3,       vr3
    vor.v         vr3,      vr5,       vr5
    vor.v         vr5,      vr4,       vr4
    vor.v         vr4,      vr7,       vr7
    vor.v         vr7,      vr10,      vr10
    vor.v         vr10,     vr19,      vr19
    vor.v         vr6,      vr8,       vr8
    vor.v         vr8,      vr1,       vr1
    vor.v         vr11,     vr17,      vr17
    vor.v         vr20,     vr13,      vr13
    vor.v         vr13,     vr9,       vr9
    vor.v         vr9,      vr0,       vr0
    vor.v         vr0,      vr14,      vr14
    vor.v         vr14,     vr20,      vr20
    vor.v         vr1,      vr18,      vr18
.else
    vor.v         vr6,      vr0,       vr0
    vor.v         vr0,      vr15,      vr15
    vor.v         vr15,     vr14,      vr14
    vor.v         vr14,     vr18,      vr18
    vor.v         vr11,     vr7,       vr7
    vor.v         vr7,      vr1,       vr1
    vor.v         vr1,      vr13,      vr13
    vor.v         vr13,     vr2,       vr2
    vor.v         vr2,      vr9,       vr9
    vor.v         vr9,      vr8,       vr8
    vor.v         vr8,      vr10,      vr10
    vor.v         vr10,     vr4,       vr4
    vor.v         vr4,      vr17,      vr17
    vor.v         vr12,     vr5,       vr5
    vor.v         vr5,      vr19,      vr19
.endif
.endm // inv_adst16_lsx

// Local routine wrapping inv_adst16_lsx in adst order (vrN = out[N]).
// NOTE(review): sz is passed as "8h" without the leading dot used by
// inv_dct_8h_x16_lsx (".8h"); confirm how vmulev_vmaddod_lsx compares it.
functionl inv_adst_8h_x16_lsx
    inv_adst16_lsx adst, 8h
endfuncl

// Local routine wrapping inv_adst16_lsx in flipped order (vrN = out[15 - N]).
// NOTE(review): sz is passed as "8h" without the leading dot used by
// inv_dct_8h_x16_lsx (".8h"); confirm how vmulev_vmaddod_lsx compares it.
functionl inv_flipadst_8h_x16_lsx
    inv_adst16_lsx flipadst, 8h
endfuncl

// Local routine wrapping inv_adst16_lsx in adst order (vrN = out[N]).
// NOTE(review): sz is passed as "4h" without the leading dot used by
// inv_dct_4h_x16_lsx (".4h"); confirm vmulev_vmaddod_lsx still selects the
// intended four-lane variant for this spelling.
functionl inv_adst_4h_x16_lsx
    inv_adst16_lsx adst, 4h
endfuncl

// Local routine wrapping inv_adst16_lsx in flipped order (vrN = out[15 - N]).
// NOTE(review): sz is passed as "4h" without the leading dot used by
// inv_dct_4h_x16_lsx (".4h"); confirm vmulev_vmaddod_lsx still selects the
// intended four-lane variant for this spelling.
functionl inv_flipadst_4h_x16_lsx
    inv_adst16_lsx flipadst, 4h
endfuncl

// VLD_DST_ADD_W8_x4 in0 .. in15
// Invokes VLD_DST_ADD_W8 four times, passing the residual registers in groups
// of four (in0..in3, in4..in7, in8..in11, in12..in15).
// Before every invocation t2 is set to a0 + 2 * a1; between invocations a0 is
// incremented by a1. VLD_DST_ADD_W8 itself is defined elsewhere in the file.
.macro VLD_DST_ADD_W8_x4 in0, in1, in2, in3, in4, in5, in6, in7, in8, \
                         in9, in10, in11, in12, in13, in14, in15

    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W8 \in0, \in1, \in2, \in3

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W8 \in4, \in5, \in6, \in7

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W8 \in8, \in9, \in10, \in11

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W8 \in12, \in13, \in14, \in15
.endm

// def_base_8x16 txfm1
// Emits the shared body inv_txfm_<txfm1>add_8x16_lsx used by the fun8x16 entry
// points (txfm1 is either empty or "identity_").
// Register contract, as used below:
//   a0/a1  destination pointer and stride (consumed by VLD_DST_ADD_W8_x4)
//   a2     coefficient buffer; every vector that is read is zeroed afterwards
//   a3     compared against t5 to decide whether the second coefficient half
//          (offsets 16, 48, ..., 240) has to be processed
//   t5     threshold loaded by the caller
//   t7/t8  addresses of the first/second 1-D transform routines (t7 is not
//          used by the identity_ variant)
//   t6     scratch used to preserve ra around each indirect call
// The identity_ variant performs no first-pass call and no rounding shift; it
// only rescales with rect2_lsx and transposes.
.macro def_base_8x16 txfm1
functionl inv_txfm_\txfm1\()add_8x16_lsx
    blt           a3,       t5,       816f      // a3 < t5: skip the second coefficient half
    vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vxor.v        vr23,     vr23,     vr23
.irp i, 16, 48, 80, 112, 144, 176, 208, 240
    vst           vr23,     a2,       \i
.endr

    // Rectangular-block scaling by 2896 (rect2_lsx is defined elsewhere).
    li.w          t0,       2896
    vreplgr2vr.w  vr23,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    rect2_lsx     \i,       vr23,     \i
.endr

    // Either way the transposed result of this half ends up in vr8..vr15.
.ifc \txfm1, identity_
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
.else
    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 1

    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
.endif

816:
    ble           t5,       a3,       816816f   // block above ran: keep its results
    // Block above was skipped: its outputs are all zero.
.irp i, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vxor.v        \i,       \i,       \i
.endr

816816:
    vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vxor.v        vr23,     vr23,     vr23
.irp i, 0, 32, 64, 96, 128, 160, 192, 224
    vst           vr23,     a2,       \i
.endr

    li.w          t0,       2896
    vreplgr2vr.w  vr23,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    rect2_lsx     \i,       vr23,     \i
.endr

.ifnc \txfm1, identity_
    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsrari.h      \i,       \i,       1
.endr
.endif

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    // Second pass over vr0..vr15.
    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    // Final rounding shift by 4. vr8..vr15 must be moved to vr16..vr23 first,
    // because the second shift reuses vr8..vr15 as its destination.
    vsrari_h_x8 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4
    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4

    VLD_DST_ADD_W8_x4 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                      vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
endfuncl
.endm

// Emit inv_txfm_identity_add_8x16_lsx and inv_txfm_add_8x16_lsx.
def_base_8x16 identity_
def_base_8x16

// DST_ADD_W16 in0 .. in11
// Adds residuals to four rows of sixteen 8-bit pixels and stores the result.
//   in0..in3   the four pixel rows (16 unsigned bytes each)
//   in4..in11  halfword residuals, two registers per row: low eight pixels
//              first, high eight pixels second
// Pixels are zero-extended to halfwords, the residuals are added, and the sums
// are narrowed back to unsigned bytes with saturation.
// Rows are written to a0, a0 + a1, t2 and t2 + a1. Clobbers vr0..vr7.
// in0..in3 may be vr0..vr3: each input is fully widened before its register is
// overwritten.
.macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11
    vsllwil.hu.bu vr4,      \in0,     0
    vexth.hu.bu   vr0,      \in0
    vsllwil.hu.bu vr5,      \in1,     0
    vexth.hu.bu   vr1,      \in1
    vsllwil.hu.bu vr6,      \in2,     0
    vexth.hu.bu   vr2,      \in2
    vsllwil.hu.bu vr7,      \in3,     0
    vexth.hu.bu   vr3,      \in3
    vadd.h        vr4,      vr4,      \in4
    vadd.h        vr0,      vr0,      \in5
    vadd.h        vr5,      vr5,      \in6
    vadd.h        vr1,      vr1,      \in7
    vadd.h        vr6,      vr6,      \in8
    vadd.h        vr2,      vr2,      \in9
    vadd.h        vr7,      vr7,      \in10
    vadd.h        vr3,      vr3,      \in11
    vssrani.bu.h  vr0,      vr4,      0     // pack low (vr4) and high (vr0) halves
    vssrani.bu.h  vr1,      vr5,      0
    vssrani.bu.h  vr2,      vr6,      0
    vssrani.bu.h  vr3,      vr7,      0
    vst           vr0,      a0,       0
    vstx          vr1,      a0,       a1
    vst           vr2,      t2,       0
    vstx          vr3,      t2,       a1
.endm

// VLD_DST_ADD_W16 in0 .. in7
// Loads four 16-byte rows from a0, a0 + a1, t2 and t2 + a1 into vr0..vr3 and
// hands them to DST_ADD_W16 together with the eight residual registers
// in0..in7 (two per row).
.macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7
    vld           vr0,      a0,       0
    vldx          vr1,      a0,       a1
    vld           vr2,      t2,       0
    vldx          vr3,      t2,       a1
    DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \
                \in4, \in5, \in6, \in7
.endm

// def_fn_16x8 txfm1
// Emits the shared body inv_txfm_<txfm1>add_16x8_lsx used by the fun16x8 entry
// points (txfm1 is either empty or "identity_").
// Register contract, as used below:
//   a0/a1  destination pointer and stride
//   a2     coefficient buffer; all 256 bytes are zeroed after being loaded
//   t7     address of the first transform routine, called once on vr0..vr15
//   t8     address of the second transform routine, called once per 8x8 half
//   t6     scratch used to preserve ra around each indirect call
// PUSH_REG/POP_REG save and restore f24..f31 because this routine writes
// vr24..vr31.
.macro def_fn_16x8 txfm1
functionl inv_txfm_\txfm1\()add_16x8_lsx
    PUSH_REG

    vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vxor.v        vr23,     vr23,     vr23
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, \
    176, 192, 208, 224, 240
    vst           vr23,     a2,       \i
.endr

    // Rectangular-block scaling by 2896 (rect2_lsx is defined elsewhere).
    li.w          t0,       2896
    vreplgr2vr.w  vr23,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    rect2_lsx     \i,       vr23,     \i
.endr

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    // The intermediate rounding shift is skipped for the identity_ variant.
.ifnc \txfm1, identity_
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vsrari.h       \i,       \i,       1
.endr
.endif

    // First 8x8 half: transpose vr0..vr7, run the second transform, park the
    // shifted result in vr24..vr31.
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, 4

    // Second 8x8 half: transpose vr8..vr15 into vr0..vr7, run the second
    // transform again, leave the shifted result in vr8..vr15.
    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4

    // Each output row takes its low eight pixels from the first half
    // (vr24..vr31) and its high eight pixels from the second half (vr8..vr15).
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W16 vr24, vr8, vr25, vr9, vr26, vr10, vr27, vr11

    alsl.d        a0,       a1,       a0,    2      // advance four rows
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W16 vr28, vr12, vr29, vr13, vr30, vr14, vr31, vr15

    POP_REG
endfuncl
.endm

// Emit inv_txfm_identity_add_16x8_lsx and inv_txfm_add_16x8_lsx.
def_fn_16x8 identity_
def_fn_16x8

// fun16x8 txfm1, txfm2
// Emits the exported entry point inv_txfm_add_<txfm1>_<txfm2>_16x8_8bpc_lsx.
// For dct_dct with a3 == 0 a DC-only shortcut built on idct_dc (defined
// elsewhere) is used. Otherwise the entry sets up
//   t7 = first transform routine (inv_<txfm1>_8h_x16_lsx, or
//        inv_identity_8h_x16_lsx1 when txfm1 is identity)
//   t8 = second transform routine (inv_<txfm2>_8h_x8_lsx)
// and branches to inv_txfm_identity_add_16x8_lsx for identity first passes or
// inv_txfm_add_16x8_lsx otherwise.
.macro fun16x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_16x8_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_16x8     // a3 != 0: take the general path

    idct_dc 16, 8, 1

    // Rows 0..3: add vr20 to the pixel rows passed in vr10..vr13.
    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
                vr20, vr20, vr20, vr20, vr20

    // Rows 4..7: advance four rows, reload the pixels and add vr20 again.
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1
    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
    b             .\txfm1\()_\txfm2\()_16x8_END
.NO_HAS_DCONLY_16x8:
.endif

    la.local      t8,       inv_\txfm2\()_8h_x8_lsx

    // Load t7 exactly once per variant; the identity first pass uses the
    // dedicated "lsx1" routine together with the identity shared body.
.ifc \txfm1, identity
    la.local      t7,       inv_identity_8h_x16_lsx1
    b             inv_txfm_identity_add_16x8_lsx
.else
    la.local      t7,       inv_\txfm1\()_8h_x16_lsx
    b             inv_txfm_add_16x8_lsx
.endif

.\txfm1\()_\txfm2\()_16x8_END:
endfunc
.endm

// Emit one inv_txfm_add_<txfm1>_<txfm2>_16x8_8bpc_lsx entry point for each
// (first-pass, second-pass) transform pair listed below.
fun16x8 dct, dct
fun16x8 identity, identity
fun16x8 dct, adst
fun16x8 dct, flipadst
fun16x8 dct, identity
fun16x8 adst, dct
fun16x8 adst, adst
fun16x8 adst, flipadst
fun16x8 flipadst, dct
fun16x8 flipadst, adst
fun16x8 flipadst, flipadst
fun16x8 identity, dct
fun16x8 adst, identity
fun16x8 flipadst, identity
fun16x8 identity, adst
fun16x8 identity, flipadst

// Entry-point generator for the 8x16 inverse transforms.
//   txfm1     first-pass transform; unless it is identity, the address of
//             its 8-point helper is handed over in t7
//   txfm2     second-pass transform; the address of its 16-point helper is
//             handed over in t8
//   eob_half  immediate loaded into t5 for the shared body
// dct_dct gets a DC-only shortcut when a3 == 0; everything else tail-branches
// to the identity or the generic shared 8x16 body.
.macro fun8x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_8x16_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_8x16

    // DC-only: one DST_ADD_W8 on the rows preloaded by idct_dc, then three
    // more load/add/store groups.
    idct_dc 8, 16, 1

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
.rept 3
    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,     1
    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20
.endr

    b             .\txfm1\()_\txfm2\()_8x16_END
.NO_HAS_DCONLY_8x16:
.endif
    la.local     t8,    inv_\txfm2\()_8h_x16_lsx
    li.w         t5,    \eob_half
.ifc \txfm1, identity
    // The identity body is entered without a first-pass helper in t7.
    b            inv_txfm_identity_add_8x16_lsx
.else
    la.local     t7,    inv_\txfm1\()_8h_x8_lsx
    b            inv_txfm_add_8x16_lsx
.endif
.\txfm1\()_\txfm2\()_8x16_END:
endfunc
.endm

// Emit one inv_txfm_add_<txfm1>_<txfm2>_8x16_8bpc_lsx entry point per
// transform pair. The third argument is the eob_half value the entry point
// loads into t5.
fun8x16 dct, dct, 43
fun8x16 identity, identity, 43
fun8x16 dct, adst, 43
fun8x16 dct, flipadst, 43
fun8x16 dct, identity, 8
fun8x16 adst, dct, 43
fun8x16 adst, adst, 43
fun8x16 adst, flipadst, 43
fun8x16 flipadst, dct, 43
fun8x16 flipadst, adst, 43
fun8x16 flipadst, flipadst, 43
fun8x16 identity, dct, 64
fun8x16 adst, identity, 8
fun8x16 flipadst, identity, 8
fun8x16 identity, adst, 64
fun8x16 identity, flipadst, 64

// Shared body of the 16x16 inverse transforms.
// Register usage, as read from the code below:
//   a0/a1  destination pointer and stride, consumed by VLD_DST_ADD_W16
//   a2     coefficient buffer; read in two halves and zeroed as it is read
//   a3     compared against t5 to decide whether the second half is processed
//   t5     threshold supplied by the entry point (its eob_half argument)
//   t7/t8  addresses of the first-/second-pass helpers, called with jirl
// ra is parked in t6 around each helper call. Clobbers t1, t2, t6 and the
// vector registers used below.
// NOTE(review): the scratch area is addressed from sp + 64, so malloc_space
// presumably reserves 64 extra bytes below it -- confirm in its definition.
functionl inv_txfm_add_16x16_lsx
    malloc_space 512

    addi.d        t1,       sp,       64
    // NOTE(review): t2 is not read again in this function before it is
    // overwritten ahead of the write-back loop; kept in case a helper called
    // through t7/t8 relies on it.
    addi.d        t2,       a2,       0
    // First pass, two iterations: load 16 vectors (stride 32) from a2, zero
    // them in the coefficient buffer, run the t7 helper, transpose both 8x8
    // halves, round-shift by 2 and store 256 bytes to the scratch area.
.rept 2
    vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    vxor.v        vr23,     vr23,     vr23
.irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, \
    384, 416, 448, 480
    vst           vr23,     a2,       \i
.endr

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vsrari.h       \i,       \i,       2
.endr
    vst_x8 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vst_x8 t1, 16, 32, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    addi.d         t1,       t1,       256
    addi.d         a2,       a2,       16
    // After the first iteration this skips the second one when a3 < t5.
    blt            a3,       t5,       1616f
.endr

1616:
    // If the second iteration ran (t5 <= a3) its output is already in place;
    // otherwise clear the 256 bytes at sp + 320 that it would have written.
    ble           t5,       a3,       16161616f
    addi.d        t1,       sp,       320
    vxor.v        vr23,     vr23,     vr23
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
    240
    vst           vr23,     t1,       \i
.endr

16161616:
    // Second pass, in place on the scratch area: two groups of 16 vectors
    // (stride 32, groups 16 bytes apart) are run through the t8 helper.
    addi.d        t1,       sp,       64
.rept 2
    vld_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    vst_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    addi.d        t1,       t1,       16
.endr
    // Write-back: four iterations, each reading 128 bytes of scratch,
    // round-shifting by 4 and adding to the destination, then advancing a0
    // by 4 * a1 (t2 = a0 + 2 * a1).
    alsl.d        t2,       a1,       a0,    1
    addi.d        t1,       sp,       64
.rept 4
    vld_x8 t1, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vsrari_h_x8 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
                vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4
    VLD_DST_ADD_W16 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    addi.d        t1,       t1,       128
.endr
    free_space 512
endfuncl

// Entry-point generator for the 16x16 inverse transforms.
//   txfm1     first-pass transform; address of its 16-point helper goes in t7
//   txfm2     second-pass transform; address of its 16-point helper goes in t8
//   eob_half  immediate loaded into t5 for the shared body
// dct_dct takes a DC-only shortcut when a3 == 0; all other cases set up
// t5/t7/t8 and tail-branch to inv_txfm_add_16x16_lsx.
.macro fun16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_16x16

    // DC-only: the rows preloaded by idct_dc are handled first, then three
    // more groups, stepping a0 by 4 * a1 each time (t2 = a0 + 2 * a1).
    idct_dc 16, 16, 2

    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
                    vr20, vr20, vr20, vr20, vr20
.rept 3
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b             .\txfm1\()_\txfm2\()_16x16_END
.NO_HAS_DCONLY_16x16:
.endif
    la.local     t7,    inv_\txfm1\()_8h_x16_lsx
    la.local     t8,    inv_\txfm2\()_8h_x16_lsx
    li.w         t5,    \eob_half

    b            inv_txfm_add_16x16_lsx
.\txfm1\()_\txfm2\()_16x16_END:
endfunc
.endm

// Emit one inv_txfm_add_<txfm1>_<txfm2>_16x16_8bpc_lsx entry point per
// transform pair. Every variant passes 36 as eob_half (loaded into t5).
fun16x16 dct, dct, 36
fun16x16 adst, adst, 36
fun16x16 adst, dct, 36
fun16x16 dct, adst, 36
fun16x16 flipadst, dct, 36
fun16x16 dct, flipadst, 36
fun16x16 adst, flipadst, 36
fun16x16 flipadst, adst, 36

// Core of the 32-point inverse DCT on vectors of 16-bit lanes: derives the
// terms t16..t31 from 16 input vectors held in registers, then combines them
// with 16 previously stored vectors to produce the outputs c[0]..c[31].
//
// Register inputs (in the order the visible callers load them):
//   vr0..vr7, vr19, vr24..vr30
// Parameters:
//   in1           base register for the four 8-vector output stores
//   in2           base register the 16 previously stored vectors are loaded from
//   vld_st0/1     start offsets passed to vld_x8 for the first/second 8 loads
//   vld_stride    stride passed to vld_x8
//   vst_st0..3    start offsets passed to vst_x8 for c[0..7], c[24..31],
//                 c[8..15] and c[16..23] respectively
//   vst_stride    stride passed to vst_x8
//   transpose8x8  if non-blank, each group of 8 vectors is transposed before
//                 it is stored
//   shift         if non-blank, vsrari.h rounding shift applied to each output
// Clobbers t0 (pointed at idct_coeffs) and the vector registers used as
// temporaries below.
// NOTE(review): vmul_vmadd_w / vmul_vmsub_w are defined elsewhere; the trailing
// comments on the coefficient loads and results are the original annotations.
.macro dct_8x32_core_lsx in1, in2, vld_st0, vld_st1, vld_stride, \
                         vst_st0, vst_st1, vst_st2, vst_st3, vst_stride, \
                         transpose8x8, shift
    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       64           // 201
    vldrepl.w     vr21,     t0,       68           // 4091
    vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
    vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
    vssrarni.h.w  vr9,      vr8,      12           // t31a
    vssrarni.h.w  vr10,     vr11,     12           // t16a
    vldrepl.w     vr20,     t0,       72           // 3035
    vldrepl.w     vr21,     t0,       76           // 2751
    vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0
    vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
    vssrarni.h.w  vr0,      vr8,      12           // t30a
    vssrarni.h.w  vr30,     vr11,     12           // t17a
    vldrepl.w     vr20,     t0,       80           // 1751
    vldrepl.w     vr21,     t0,       84           // 3703
    vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7
    vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19
    vssrarni.h.w  vr7,      vr8,      12           // t29a
    vssrarni.h.w  vr19,     vr11,     12           // t18a
    vldrepl.w     vr20,     t0,       88           // 3857
    vldrepl.w     vr21,     t0,       92           // 1380
    vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4
    vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26
    vssrarni.h.w  vr4,      vr8,      12           // t28a
    vssrarni.h.w  vr26,     vr11,     12           // t19a
    vldrepl.w     vr20,     t0,       96           // 995
    vldrepl.w     vr21,     t0,       100          // 3973
    vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3
    vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27
    vssrarni.h.w  vr3,      vr8,      12           // t27a
    vssrarni.h.w  vr27,     vr11,     12           // t20a
    vldrepl.w     vr20,     t0,       104          // 3513
    vldrepl.w     vr21,     t0,       108          // 2106
    vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2
    vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28
    vssrarni.h.w  vr2,      vr8,      12           // t26a
    vssrarni.h.w  vr28,     vr11,     12           // t21a
    vldrepl.w     vr20,     t0,       112          // 2440 -> 1220
    vldrepl.w     vr21,     t0,       116          // 3290 -> 1645
    vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5
    vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25
    vssrarni.h.w  vr5,      vr8,      12           // t25a
    vssrarni.h.w  vr25,     vr11,     12           // t22a
    vldrepl.w     vr20,     t0,       120          // 4052
    vldrepl.w     vr21,     t0,       124          // 601
    vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6
    vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24
    vssrarni.h.w  vr6,      vr8,      12           // t24a
    vssrarni.h.w  vr24,     vr11,     12           // t23a

    // Saturating add/sub butterflies on neighbouring pairs.
    vsadd.h       vr1,      vr10,     vr30         // t16
    vssub.h       vr29,     vr10,     vr30         // t17
    vssub.h       vr8,      vr26,     vr19         // t18
    vsadd.h       vr31,     vr26,     vr19         // t19
    vsadd.h       vr10,     vr27,     vr28         // t20
    vssub.h       vr30,     vr27,     vr28         // t21
    vssub.h       vr19,     vr24,     vr25         // t22
    vsadd.h       vr26,     vr24,     vr25         // t23
    vsadd.h       vr27,     vr6,      vr5          // t24
    vssub.h       vr28,     vr6,      vr5          // t25
    vssub.h       vr24,     vr3,      vr2          // t26
    vsadd.h       vr25,     vr3,      vr2          // t27
    vsadd.h       vr5,      vr4,      vr7          // t28
    vssub.h       vr6,      vr4,      vr7          // t29
    vssub.h       vr2,      vr9,      vr0          // t30
    vsadd.h       vr3,      vr9,      vr0          // t31

    vldrepl.w     vr20,     t0,       16           // 799
    vldrepl.w     vr21,     t0,       20           // 4017
    vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
    vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
    vssrarni.h.w  vr7,      vr4,      12           // t30a
    vssrarni.h.w  vr0,      vr11,     12           // t17a
    vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
    vneg.w        vr4,      vr4
    vneg.w        vr9,      vr9
    vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
    vssrarni.h.w  vr9,      vr4,      12           // t18a
    vssrarni.h.w  vr2,      vr11,     12           // t29a
    vldrepl.w     vr20,     t0,       24           // 3406 -> 1703
    vldrepl.w     vr21,     t0,       28           // 2276 -> 1138
    vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
    vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
    vssrarni.h.w  vr29,     vr4,      12           // t26a
    vssrarni.h.w  vr6,      vr11,     12           // t21a
    vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
    vneg.w        vr4,      vr4
    vneg.w        vr8,      vr8
    vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
    vssrarni.h.w  vr8,      vr4,      12           // t22a
    vssrarni.h.w  vr24,     vr11,     12           // t25a

    vsadd.h       vr4,      vr1,      vr31         // t16a
    vssub.h       vr30,     vr1,      vr31         // t19a
    vsadd.h       vr19,     vr0,      vr9          // t17
    vssub.h       vr28,     vr0,      vr9          // t18
    vssub.h       vr1,      vr26,     vr10         // t20a
    vsadd.h       vr31,     vr26,     vr10         // t23a
    vssub.h       vr0,      vr8,      vr6          // t21
    vsadd.h       vr9,      vr8,      vr6          // t22
    vsadd.h       vr10,     vr27,     vr25         // t24a
    vssub.h       vr26,     vr27,     vr25         // t27a
    vsadd.h       vr6,      vr24,     vr29         // t25
    vssub.h       vr8,      vr24,     vr29         // t26
    vssub.h       vr25,     vr3,      vr5          // t28a
    vsadd.h       vr27,     vr3,      vr5          // t31a
    vssub.h       vr24,     vr7,      vr2          // t29
    vsadd.h       vr29,     vr7,      vr2          // t30

    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
    vssrarni.h.w  vr5,      vr3,      12           // t29a
    vssrarni.h.w  vr2,      vr11,     12           // t18a
    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
    vssrarni.h.w  vr7,      vr3,      12           // t28
    vssrarni.h.w  vr24,     vr11,     12           // t19
    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
    vneg.w        vr3,      vr3
    vneg.w        vr28,     vr28
    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
    vssrarni.h.w  vr28,     vr3,      12           // t20
    vssrarni.h.w  vr25,     vr11,     12           // t27
    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
    vneg.w        vr3,      vr3
    vneg.w        vr30,     vr30
    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
    vssrarni.h.w  vr30,     vr3,      12           // t21a
    vssrarni.h.w  vr1,      vr11,     12           // t26a

    vsadd.h       vr3,      vr4,      vr31         // t16
    vssub.h       vr26,     vr4,      vr31         // t23
    vsadd.h       vr0,      vr19,     vr9          // t17a
    vssub.h       vr8,      vr19,     vr9          // t22a
    vsadd.h       vr4,      vr2,      vr30         // t18
    vssub.h       vr31,     vr2,      vr30         // t21
    vsadd.h       vr9,      vr24,     vr28         // t19a
    vssub.h       vr19,     vr24,     vr28         // t20a
    vssub.h       vr2,      vr27,     vr10         // t24
    vsadd.h       vr30,     vr27,     vr10         // t31
    vssub.h       vr24,     vr29,     vr6          // t25a
    vsadd.h       vr28,     vr29,     vr6          // t30a
    vssub.h       vr10,     vr5,      vr1          // t26
    vsadd.h       vr27,     vr5,      vr1          // t29
    vssub.h       vr6,      vr7,      vr25         // t27a
    vsadd.h       vr29,     vr7,      vr25         // t28a

    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
    vssrarni.h.w  vr5,      vr1,      12           // t20
    vssrarni.h.w  vr7,      vr11,     12           // t27
    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
    vssrarni.h.w  vr25,     vr1,      12           // t21a
    vssrarni.h.w  vr6,      vr11,     12           // t26a
    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
    vssrarni.h.w  vr19,     vr1,      12           // t22
    vssrarni.h.w  vr10,     vr11,     12           // t25
    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
    vssrarni.h.w  vr31,     vr1,      12           // t23a
    vssrarni.h.w  vr8,      vr11,     12           // t24a

    // Final register assignment of the t16..t31 terms:
    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
    // First group of 8 stored vectors: combine with t31..t24a to form
    // c[0..7] (sums) and c[31..24] (differences).
    vld_x8 \in2, \vld_st0, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h       vr1,      vr11,     vr30         // c[0]
    vssub.h       vr2,      vr11,     vr30         // c[31]
    vsadd.h       vr24,     vr12,     vr28         // c[1]
    vssub.h       vr26,     vr12,     vr28         // c[30]
    vsadd.h       vr11,     vr13,     vr27         // c[2]
    vssub.h       vr30,     vr13,     vr27         // c[29]
    vsadd.h       vr12,     vr14,     vr29         // c[3]
    vssub.h       vr28,     vr14,     vr29         // c[28]
    vsadd.h       vr13,     vr15,     vr7          // c[4]
    vssub.h       vr27,     vr15,     vr7          // c[27]
    vsadd.h       vr14,     vr16,     vr6          // c[5]
    vssub.h       vr29,     vr16,     vr6          // c[26]
    vsadd.h       vr7,      vr17,     vr10         // c[6]
    vssub.h       vr15,     vr17,     vr10         // c[25]
    vsadd.h       vr6,      vr18,     vr8          // c[7]
    vssub.h       vr16,     vr18,     vr8          // c[24]

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
                       vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
    vsrari.h      \i,       \i,       \shift
.endr
.endif

    // c[0..7]
    vst_x8 \in1, \vst_st0, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
                       vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
    vsrari.h      \i,       \i,       \shift
.endr
.endif

    // c[24..31]
    vst_x8 \in1, \vst_st1, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2

    // Second group of 8 stored vectors: combine with t23a..t16 to form
    // c[8..15] (sums) and c[23..16] (differences).
    vld_x8 \in2, \vld_st1, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h       vr1,      vr11,     vr31         // c[8]
    vssub.h       vr2,      vr11,     vr31         // c[23]
    vsadd.h       vr24,     vr12,     vr19         // c[9]
    vssub.h       vr26,     vr12,     vr19         // c[22]
    vsadd.h       vr11,     vr13,     vr25         // c[10]
    vssub.h       vr30,     vr13,     vr25         // c[21]
    vsadd.h       vr12,     vr14,     vr5          // c[11]
    vssub.h       vr28,     vr14,     vr5          // c[20]
    vsadd.h       vr13,     vr15,     vr9          // c[12]
    vssub.h       vr27,     vr15,     vr9          // c[19]
    vsadd.h       vr14,     vr16,     vr4          // c[13]
    vssub.h       vr29,     vr16,     vr4          // c[18]
    vsadd.h       vr7,      vr17,     vr0          // c[14]
    vssub.h       vr15,     vr17,     vr0          // c[17]
    vsadd.h       vr6,      vr18,     vr3          // c[15]
    vssub.h       vr16,     vr18,     vr3          // c[16]

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
                       vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
    vsrari.h      \i,       \i,       \shift
.endr
.endif

    // c[8..15]
    vst_x8 \in1, \vst_st2, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
                       vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
    vsrari.h      \i,       \i,       \shift
.endr
.endif

    // c[16..23]
    vst_x8 \in1, \vst_st3, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
.endm

// Threshold tables of four 16-bit entries each.
// eob_32x32 is walked by the 32x32 dct_dct function: one entry is read per
// first-pass iteration and the loop repeats while a3 >= that entry.
const eob_32x32
        .short 36, 136, 300, 1024
endconst

// NOTE(review): presumably the equivalent thresholds for the 8x32 transforms;
// no user is visible in this part of the file.
const eob_8x32
        .short 43, 107, 171, 256
endconst

// NOTE(review): presumably the equivalent thresholds for the 16x32 transforms;
// no user is visible in this part of the file.
const eob_16x32
        .short 36, 151, 279, 512
endconst

// Add eight vectors of 16-bit values to two rows of 32 pixels and store the
// result back as unsigned-saturated bytes.
// Expects the pixels to be loaded already:
//   vr10 / vr11 = bytes 0-15 / 16-31 of the row at a0
//   vr12 / vr13 = bytes 0-15 / 16-31 of the row at t2
// in0..in3 are added to the four 8-pixel groups of the a0 row, in4..in7 to
// those of the t2 row.
// Clobbers vr4-vr7 and vr10-vr13. The arguments are read only after all of
// those registers have been rewritten, so they must not alias them.
.macro DST_ADD_W32 in0, in1, in2, in3, in4, in5, in6, in7
    // Zero-extend the low 8 bytes of each pixel vector into vr4-vr7 ...
    vsllwil.hu.bu vr4,      vr10,     0
    vsllwil.hu.bu vr5,      vr11,     0
    vsllwil.hu.bu vr6,      vr12,     0
    vsllwil.hu.bu vr7,      vr13,     0
    // ... and the high 8 bytes in place.
    vexth.hu.bu   vr10,     vr10
    vexth.hu.bu   vr11,     vr11
    vexth.hu.bu   vr12,     vr12
    vexth.hu.bu   vr13,     vr13
    vadd.h        vr4,      vr4,      \in0
    vadd.h        vr10,     vr10,     \in1
    vadd.h        vr5,      vr5,      \in2
    vadd.h        vr11,     vr11,     \in3
    vadd.h        vr6,      vr6,      \in4
    vadd.h        vr12,     vr12,     \in5
    vadd.h        vr7,      vr7,      \in6
    vadd.h        vr13,     vr13,     \in7
    // Narrow each low/high pair back to 16 unsigned-saturated bytes.
    vssrani.bu.h  vr10,     vr4,      0
    vssrani.bu.h  vr11,     vr5,      0
    vssrani.bu.h  vr12,     vr6,      0
    vssrani.bu.h  vr13,     vr7,      0
    vst           vr10,     a0,       0
    vst           vr11,     a0,       16
    vst           vr12,     t2,       0
    vst           vr13,     t2,       16
.endm

// DC-only setup for 32-pixel-wide blocks.
//   w, h   block dimensions; when one is exactly twice the other, an extra
//          multiply by 181 with a rounding shift by 8 is assembled in
//   shift  extra rounding right shift, applied only when > 0
// Reads the 16-bit DC coefficient at a2 and stores zero back to it, leaves the
// final DC value in every halfword lane of vr20, and preloads two 32-byte
// destination rows: vr10/vr11 from a0 and vr12/vr13 from a0 + a1.
// Clobbers t2 (first the DC value, then left holding a0 + a1), vr0, vr1, vr2.
.macro idct_dc_w32 w, h, shift
    ld.h          t2,       a2,       0      // dc
    vldi          vr0,      0x8b5            // 181
    vreplgr2vr.w  vr1,      t2
    vldi          vr20,     0x880            // 128
    vmul.w        vr2,      vr0,      vr1    // dc * 181
    st.h          zero,     a2,       0
    add.d         t2,       a0,       a1     // t2 = second dst row
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    vld           vr13,     t2,       16

.if (2*\w == \h) || (2*\h == \w)
    vmul.w        vr2,      vr2,      vr0
    vsrari.w      vr2,      vr2,      8
.endif

.if \shift>0
    vsrari.w      vr2,      vr2,      \shift      // (dc + rnd) >> shift
.endif
    vld           vr11,     a0,       16
    vmadd.w       vr20,     vr2,      vr0    // 128 + dc * 181
    vld           vr12,     t2,       0
    vssrarni.h.w  vr20,     vr20,     12     // rounding shift by 12, narrowed to 16 bits
    vld           vr10,     a0,       0
.endm

// 32x8 DCT/DCT inverse transform, added to the destination.
// Register usage, as read from the code below:
//   a0/a1  destination pointer and stride
//   a2     coefficient buffer; read and zeroed
//   a3     zero selects the DC-only shortcut
// NOTE(review): the scratch area is addressed from sp + 64, so malloc_space
// presumably reserves 64 extra bytes below it -- confirm in its definition.
function inv_txfm_add_dct_dct_32x8_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_32x8

    // DC-only: add the value in vr20 to every pixel, two rows per step
    // (1 + 3 steps).
    idct_dc_w32 32, 8, 2

    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20

.rept 3
    alsl.d        a0,       a1,       a0,     1     // a0 += 2 * a1
    add.d         t2,       a0,       a1
    vld           vr10,     a0,       0
    vld           vr11,     a0,       16
    vld           vr12,     t2,       0
    vld           vr13,     t2,       16
    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b             .DCT_DCT_32X8_END
.NO_HAS_DCONLY_32x8:
    malloc_space 512+256

    addi.d        t1,       sp,       64            // t1 = first-pass output (512 bytes)
    addi.d        t2,       a2,       0             // t2 = coefficients
    addi.d        t3,       sp,       64
    addi.d        t3,       t3,       512           // t3 = 256-byte area for the 16-point result

    // Even-indexed coefficient vectors (offset 0, stride 32): load, clear in
    // the buffer, run the 16-point DCT and park the result at t3.
    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    vxor.v        vr31,     vr31,     vr31
    vst_x16 t2, 0, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    inv_dct16_lsx .8h

    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    // Odd-indexed coefficient vectors (offset 16, stride 32), loaded into the
    // registers dct_8x32_core_lsx takes its inputs from, then cleared.
    vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    vxor.v        vr31,     vr31,     vr31

    vst_x16 t2, 16, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    // Combine with the result parked at t3; outputs are transposed,
    // round-shifted by 2 and written to t1.
    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2

    // Second pass, in place: four groups of 8 vectors (stride 64, groups 16
    // bytes apart) through the 8-point DCT, then a rounding shift by 4.
    addi.d        t2,       sp,       64
.rept 4
    vld_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsrari.h      \i,       \i,       4
.endr

    vst_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    addi.d        t2,       t2,       16
.endr

    // Write-back: each iteration adds 128 bytes of scratch to two dst rows.
    addi.d        t0,       sp,       64
.rept 4
    add.d         t2,       a0,       a1
    vld           vr10,     a0,       0
    vld           vr11,     a0,       16
    vld           vr12,     t2,       0
    vld           vr13,     t2,       16
    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    alsl.d        a0,       a1,       a0,     1
    addi.d        t0,       t0,       128
.endr
    free_space 512+256
.DCT_DCT_32X8_END:
endfunc

// 32x16 DCT/DCT inverse transform, added to the destination.
// Register usage, as read from the code below:
//   a0/a1  destination pointer and stride
//   a2     coefficient buffer; read and zeroed
//   a3     zero selects the DC-only shortcut
// NOTE(review): the scratch area is addressed from sp + 64, so malloc_space
// presumably reserves 64 extra bytes below it -- confirm in its definition.
function inv_txfm_add_dct_dct_32x16_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_32x16

    // DC-only: add the value in vr20 to every pixel, two rows per step
    // (1 + 7 steps).
    idct_dc_w32 32, 16, 1

    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20

.rept 7
    alsl.d        a0,       a1,       a0,     1     // a0 += 2 * a1
    add.d         t2,       a0,       a1
    vld           vr10,     a0,       0
    vld           vr11,     a0,       16
    vld           vr12,     t2,       0
    vld           vr13,     t2,       16
    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b             .DCT_DCT_32X16_END
.NO_HAS_DCONLY_32x16:
    malloc_space 1024+256                            // 32*16*2+256
    addi.d        t1,       sp,       64            // t1 = first-pass output (1024 bytes)
    addi.d        t2,       a2,       0             // t2 = coefficients
    addi.d        t3,       sp,       64
    addi.d        t3,       t3,       1024          // t3 = 256-byte area for the 16-point result
    // First pass, two iterations; t2 advances by 16 bytes and t1 by 512 bytes
    // per iteration.
.rept 2
    // Even-indexed coefficient vectors (offset 0, stride 64): load and clear.
    vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    vxor.v        vr31,     vr31,     vr31
    vst_x16 t2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    // NOTE(review): rect2_lsx is defined elsewhere; presumably the rectangular
    // block scaling by 2896/4096 -- confirm.
    li.w          t0,       2896
    vreplgr2vr.w  vr23,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
     vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    rect2_lsx   \i, vr23, \i
.endr

    inv_dct16_lsx .8h

    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    // Odd-indexed coefficient vectors (offset 32, stride 64), loaded into the
    // registers dct_8x32_core_lsx takes its inputs from.
    vld_x16 t2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    la.local      t0,       idct_coeffs
    vldrepl.w     vr23,     t0,       0        // 2896
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
    vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    rect2_lsx \i, vr23, \i
.endr
    vxor.v        vr31,     vr31,     vr31
    vst_x16 t2, 32, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    // Combine with the result parked at t3; outputs are transposed,
    // round-shifted by 1 and written to t1.
    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 1

    addi.d        t2,       t2,       16
    addi.d        t1,       t1,       512
.endr

    // Second pass, in place: four groups of 16 vectors (stride 64, groups 16
    // bytes apart) through the 16-point DCT, then a rounding shift by 4.
    addi.d        t2,       sp,       64
.rept 4
    vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    inv_dct16_lsx .8h

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vsrari.h      \i,       \i,       4
.endr

    vst_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    addi.d        t2,       t2,       16
.endr

    // Write-back: each iteration adds 128 bytes of scratch to two dst rows.
    addi.d        t0,       sp,       64
.rept 8
    add.d         t2,       a0,       a1
    vld           vr10,     a0,       0
    vld           vr11,     a0,       16
    vld           vr12,     t2,       0
    vld           vr13,     t2,       16
    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    alsl.d        a0,       a1,       a0,     1
    addi.d        t0,       t0,       128
.endr
    free_space 1024+256
.DCT_DCT_32X16_END:
endfunc

/*
 * inv_txfm_add_dct_dct_32x32_8bpc_lsx
 *
 * 32x32 inverse DCT applied in two passes; the result is added to an 8-bit
 * destination block, two rows per step.
 *
 * Register usage as visible in this body:
 *   a0 - destination pointer (rows are read at a0 and a0 + a1)
 *   a1 - destination stride in bytes
 *   a2 - pointer to the 16-bit coefficients; they are overwritten with zero
 *        as soon as they have been loaded
 *   a3 - zero selects the DC-only path, otherwise it is compared against the
 *        eob_32x32 thresholds (presumably the end-of-block index - confirm
 *        against the C prototype)
 *
 * Stack scratch obtained with malloc_space 2560:
 *   sp + 64   .. sp + 2111   four 512-byte slices, addressed through t1 in
 *                            the first pass and re-read by the second pass
 *   sp + 2112 ..             staging area (t3) for the 16 vectors produced
 *                            by inv_dct16_lsx
 *
 * All helper macros (idct_dc_w32, inv_dct16_lsx, dct_8x32_core_lsx,
 * DST_ADD_W32, vld_x16/vst_x16/vld_x8, malloc_space/free_space) are defined
 * elsewhere; what is said about them below is inferred from how they are
 * used here.
 */
function inv_txfm_add_dct_dct_32x32_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_32x32

    // a3 == 0: DC-only path. The first DST_ADD_W32 has no loads of its own,
    // so idct_dc_w32 presumably leaves the DC residual in vr20 and the first
    // two destination rows in vr10-vr13.
    idct_dc_w32 32, 32, 2

    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
    // Remaining 15 row pairs: advance a0 by two rows, reload both rows and
    // apply the same residual.
.rept 15
    alsl.d        a0,       a1,       a0,     1
    add.d         t2,       a0,       a1
    vld           vr10,     a0,       0
    vld           vr11,     a0,       16
    vld           vr12,     t2,       0
    vld           vr13,     t2,       16
    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b             .DCT_DCT_32X32_END
.NO_HAS_DCONLY_32x32:
    malloc_space 2560                              // 32*32*2+512

    addi.d        t1,       sp,       64           // first-pass output cursor
    addi.d        t2,       a2,       0            // coefficient read cursor
    // t3 = sp + 2112, built in three steps (presumably because a single
    // addi.d immediate cannot hold 2112)
    addi.d        t3,       sp,       1024
    addi.d        t3,       t3,       1024
    addi.d        t3,       t3,       64

    // First pass. Each iteration handles one slice of eight 16-bit lanes
    // (16 bytes) across all 32 coefficient lines and is followed by a test
    // of a3 against the next eob_32x32 threshold.
    la.local      t8,       eob_32x32
.DCT_DCT_EOB_32x32:
    ld.h          t7,       t8,       0            // threshold for this slice
    addi.d        t8,       t8,       2

    // Lines at byte offsets 0, 128, 256, ... (every second 64-byte line)
    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    // Clear the coefficients that were just consumed
    vxor.v        vr31,     vr31,     vr31
    vst_x16 t2, 0, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    inv_dct16_lsx .8h

    // Park the 16-point result at t3 for dct_8x32_core_lsx
    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    // The other 16 lines, at byte offsets 64, 192, 320, ...
    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    // vr31 is zeroed again, presumably because inv_dct16_lsx may clobber it
    vxor.v        vr31,     vr31,     vr31

    vst_x16 t2, 64, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    // Combines the staged 16-point result with the second set of lines and
    // writes the slice at t1 (with transpose; the trailing 2 is presumably
    // the rounding shift - confirm against the macro definition)
    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2

    addi.d        t2,       t2,       16           // next eight lanes
    addi.d        t1,       t1,       512          // next output slice
    bge           a3,       t7,       .DCT_DCT_EOB_32x32

    // The loop stopped at the first slice whose threshold exceeds a3. Zero
    // the slices that were not produced, from the last one downwards. Each
    // 512-byte slice is cleared with two vst_x16 of 256 bytes; the second
    // uses t1 = sp + 64 + 256 as its base (presumably to keep the store
    // offsets within the immediate range).
    la.local      t8,       eob_32x32
    vxor.v        vr31,     vr31,     vr31
    ld.h          t7,       t8,       4
    bge           a3,       t7,       .DCT_DCT_EOB_32x32_END   // a3>=t7
    vst_x16 sp, 64+1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    addi.d        t1,       sp,       256+64
    vst_x16 t1, 1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    ld.h          t7,       t8,       2
    bge           a3,       t7,       .DCT_DCT_EOB_32x32_END
    vst_x16 sp, 64+1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    vst_x16 t1, 1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    ld.h          t7,       t8,       0
    bge           a3,       t7,       .DCT_DCT_EOB_32x32_END
    vst_x16 sp, 64+512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    vst_x16 t1, 512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

.DCT_DCT_EOB_32x32_END:
    // Second pass, in place over the scratch buffer: both the read cursor
    // (t2) and the write cursor (t1) start at sp + 64 and move 16 bytes
    // (eight lanes) per iteration.
    addi.d        t2,       sp,       64
    addi.d        t1,       sp,       64
.rept 4
    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    inv_dct16_lsx .8h

    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    // No transpose argument this time; trailing 4 is presumably the final
    // rounding shift
    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 1536, 512, 1024, 64, , 4

    addi.d        t2,       t2,       16
    addi.d        t1,       t1,       16
.endr

    // Add the residual to the destination: 16 iterations, two rows each.
    // Every iteration consumes 128 bytes (8 vectors) of the scratch buffer.
    addi.d        t0,       sp,       64
.rept 16
    add.d         t2,       a0,       a1
    vld           vr10,     a0,       0
    vld           vr11,     a0,       16
    vld           vr12,     t2,       0
    vld           vr13,     t2,       16
    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    alsl.d        a0,       a1,       a0,     1
    addi.d        t0,       t0,       128
.endr

    free_space 2560                                // 32*32*2+512
.DCT_DCT_32X32_END:
endfunc

/*
 * dct_8x8_tx64_core_lsx - 8-point inverse DCT on eight 16-bit lanes for the
 * tx64 path, in which only the first four inputs carry coefficients.
 *
 *   in0..in3   : input vectors; clobbered
 *   in4..in7   : scratch only - each of them is written before it is first
 *                read, so their incoming contents never reach the outputs
 *   out0..out7 : results c[0]..c[7]
 *   rect2      : pass rect2_lsx to run rect2_lsx (defined elsewhere) with the
 *                constant 2896 on in0..in3 first; any other value skips it
 *
 * Clobbers t0 (left addressing idct_coeffs) and, directly in this macro,
 * vr8, vr9, vr10, vr20, vr21, vr22, vr23. The earlier note also listed vr12,
 * which this body only touches when the caller passes it as an output.
 * rect2_lsx, vmul_vmadd_w and vmul_vmsub_w may use further temporaries - not
 * visible here.
 *
 * Every product is narrowed with vssrarni.h.w ..., 12, i.e. a rounding shift
 * right by 12 with saturation to 16 bits.
 */
.macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                             out1, out2, out3, out4, out5, out6, out7, rect2

    la.local      t0,       idct_coeffs

.ifc \rect2, rect2_lsx
    vldrepl.w     vr23,      t0,       0        // 2896
    // in4..in7 are overwritten before their first read, so only the four
    // real inputs need the rect2 scaling.
.irp i, \in0, \in1, \in2, \in3
    rect2_lsx \i, vr23, \i
.endr
.endif

    // t0 still addresses idct_coeffs: rect2_lsx leaves it untouched, which
    // dct_8x16_tx64_core_lsx already relies on after the same kind of loop.

    // Even part (4-point DCT of in0 and in2; the other two inputs are zero)
    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vsllwil.w.h   vr22,     \in2,     0
    vexth.w.h     vr23,     \in2
    vmul.w        vr8,      vr22,     vr20
    vmul.w        vr10,     vr23,     vr20
    vmul.w        \in2,     vr22,     vr21
    vmul.w        vr9,      vr23,     vr21
    vssrarni.h.w  vr10,     vr8,      12           // t2
    vssrarni.h.w  vr9,      \in2,     12           // t3

    vldrepl.w     vr20,     t0,       0            // 2896
    vsllwil.w.h   vr22,     \in0,     0
    vexth.w.h     vr23,     \in0
    vmul.w        vr8,      vr22,     vr20
    vmul.w        \in2,     vr23,     vr20
    vssrarni.h.w  \in2,     vr8,      12           // t0 == t1

    vsadd.h       vr8,      \in2,     vr9          // c[0]
    vssub.h       vr9,      \in2,     vr9          // c[3]
    vsadd.h       \in0,     \in2,     vr10         // c[1]
    vssub.h       vr10,     \in2,     vr10         // c[2]

    // inv_dct8_1d_internal_c tx64
    // in1 in3
    vldrepl.w     vr20,     t0,       16           // 799
    vldrepl.w     vr21,     t0,       20           // 4017

    vsllwil.w.h   vr22,     \in1,     0
    vexth.w.h     vr23,     \in1
    vmul.w        \in2,     vr22,     vr21
    vmul.w        \in4,     vr23,     vr21
    vmul.w        \in1,     vr22,     vr20
    vmul.w        \in6,     vr23,     vr20
    vssrarni.h.w  \in4,     \in2,     12           // t7a
    vssrarni.h.w  \in6,     \in1,     12           // t4a

    vldrepl.w     vr20,     t0,       24           // 3406
    vldrepl.w     vr21,     t0,       28           // 2276

    vsllwil.w.h   vr22,     \in3,     0
    vexth.w.h     vr23,     \in3
    vneg.w        vr21,     vr21                   // -2276
    vmul.w        \in2,     vr22,     vr20
    vmul.w        \in1,     vr23,     vr20
    vmul.w        \in3,     vr22,     vr21
    vmul.w        \in7,     vr23,     vr21
    vssrarni.h.w  \in1,     \in2,     12           // t6a
    vssrarni.h.w  \in7,     \in3,     12           // t5a

    vsadd.h       \in3,     \in6,     \in7         // t4
    vssub.h       \in6,     \in6,     \in7         // t5a
    vsadd.h       \in5,     \in4,     \in1         // t7
    vssub.h       \in4,     \in4,     \in1         // t6a

    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmadd_w  \in4, \in6, vr20, vr20, vr21, \in1
    vmul_vmsub_w  \in4, \in6, vr20, vr20, \in2, \in7
    vssrarni.h.w  \in1,     vr21,     12           // t6
    vssrarni.h.w  \in7,     \in2,     12           // t5

    vsadd.h       \out0,    vr8,      \in5         // c[0]
    vssub.h       \out7,    vr8,      \in5         // c[7]
    vsadd.h       \out1,    \in0,     \in1         // c[1]
    vssub.h       \out6,    \in0,     \in1         // c[6]
    vsadd.h       \out2,    vr10,     \in7         // c[2]
    vssub.h       \out5,    vr10,     \in7         // c[5]
    vsadd.h       \out3,    vr9,      \in3         // c[3]
    vssub.h       \out4,    vr9,      \in3         // c[4]
.endm

/*
 * dct_8x16_tx64_core_lsx - 16-point inverse DCT on eight 16-bit lanes for
 * the tx64 path. Only eight inputs carry coefficients; every odd-stage
 * product below uses a single input, i.e. the partner input is taken as zero.
 *
 * input:  in0,  in1,  in2,  in3,  in4,  in5,  in6,  in7       (fixed)
 *         vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7
 * output: out0, out1, out2, out3, out4, out5, out6, out7      (fixed)
 *         vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
 *         out8, out9, out10, out11, out12, out13, out14, out15
 *         vr27, vr30, vr23,  vr21,  vr29,  vr26,  vr25,  vr24
 *
 * rect2:  pass rect2_lsx to pre-scale the inputs with the constant 2896
 *         (done by rect2_lsx, defined elsewhere); anything else skips it.
 *
 * vr19 and vr24..vr30 are NOT inputs in this variant: each of them is
 * written before it is read, so they are scratch/outputs only. Also
 * clobbered: t0 (left addressing idct_coeffs), vr0..vr13 and vr31, plus any
 * temporaries of the helper macros.
 */
.macro dct_8x16_tx64_core_lsx rect2
    // Even half: 8-point core on in0/in2/in4/in6; vr19/vr25/vr27/vr29 are
    // handed over as scratch, results land in vr11..vr18.
    dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \
                          vr12, vr13, vr14, vr15, vr16, vr17, vr18, \rect2

    // Odd half
    // in1 in3 in5 in7
    // vr1 vr3 vr5 vr7
    la.local      t0,       idct_coeffs

.ifc \rect2, rect2_lsx
    vldrepl.w     vr23,      t0,       0        // 2896
    // vr24/vr26/vr28/vr30 are overwritten below before any read, so only
    // the four odd inputs need the rect2 scaling.
.irp i, vr1, vr3, vr5, vr7
    rect2_lsx \i, vr23, \i
.endr
.endif

    vldrepl.w     vr20,     t0,       32           // 401
    vldrepl.w     vr21,     t0,       36           // 4076
    vsllwil.w.h   vr22,     vr1,      0
    vexth.w.h     vr23,     vr1
    vmul.w        vr0,      vr22,     vr21
    vmul.w        vr10,     vr23,     vr21
    vmul.w        vr1,      vr22,     vr20
    vmul.w        vr29,     vr23,     vr20
    vssrarni.h.w  vr10,     vr0,      12           // t15a
    vssrarni.h.w  vr29,     vr1,      12           // t8a

    vldrepl.w     vr20,     t0,       40           // 3166 -> 1583
    vldrepl.w     vr21,     t0,       44           // 2598 -> 1299
    vsllwil.w.h   vr22,     vr7,      0
    vexth.w.h     vr23,     vr7
    vneg.w        vr21,     vr21                   // -2598
    vmul.w        vr0,      vr22,     vr20
    vmul.w        vr30,     vr23,     vr20
    vmul.w        vr7,      vr22,     vr21
    vmul.w        vr31,     vr23,     vr21
    vssrarni.h.w  vr30,     vr0,      12           // t14a
    vssrarni.h.w  vr31,     vr7,      12           // t9a

    vldrepl.w     vr20,     t0,       48           // 1931
    vldrepl.w     vr21,     t0,       52           // 3612
    vsllwil.w.h   vr22,     vr5,      0
    vexth.w.h     vr23,     vr5
    vmul.w        vr0,      vr22,     vr21
    vmul.w        vr24,     vr23,     vr21
    vmul.w        vr5,      vr22,     vr20
    vmul.w        vr25,     vr23,     vr20
    vssrarni.h.w  vr24,     vr0,      12           // t13a
    vssrarni.h.w  vr25,     vr5,      12           // t10a

    vldrepl.w     vr20,     t0,       56           // 3920
    vldrepl.w     vr21,     t0,       60           // 1189
    vsllwil.w.h   vr22,     vr3,      0
    vexth.w.h     vr23,     vr3
    vneg.w        vr21,     vr21                   // -1189
    vmul.w        vr0,      vr22,     vr20
    vmul.w        vr26,     vr23,     vr20
    vmul.w        vr3,      vr22,     vr21
    vmul.w        vr27,     vr23,     vr21
    vssrarni.h.w  vr26,     vr0,      12           // t12a
    vssrarni.h.w  vr27,     vr3,      12           // t11a

    // t8a  t9a  t10a t11a t12a t13a t14a t15a
    // vr29 vr31 vr25 vr27 vr26 vr24 vr30 vr10
    vsadd.h       vr28,     vr29,      vr31        // t8
    vssub.h       vr19,     vr29,      vr31        // t9
    vssub.h       vr29,     vr27,      vr25        // t10
    vsadd.h       vr9,      vr27,      vr25        // t11
    vsadd.h       vr31,     vr26,      vr24        // t12
    vssub.h       vr25,     vr26,      vr24        // t13
    vssub.h       vr27,     vr10,      vr30        // t14
    vsadd.h       vr24,     vr10,      vr30        // t15

    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
    vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30
    vssrarni.h.w  vr26,     vr0,       12          // t14a
    vssrarni.h.w  vr30,     vr1,       12          // t9a

    vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
    vneg.w        vr0,      vr0                    // t10a takes the negated sum
    vneg.w        vr19,     vr19
    vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27
    vssrarni.h.w  vr19,     vr0,       12          // t10a
    vssrarni.h.w  vr27,     vr1,       12          // t13a

    vsadd.h       vr25,     vr28,     vr9          // t8a
    vssub.h       vr29,     vr28,     vr9          // t11a
    vssub.h       vr28,     vr24,     vr31         // t12a
    vsadd.h       vr10,     vr24,     vr31         // t15a
    vsadd.h       vr9,      vr30,     vr19         // t9
    vssub.h       vr31,     vr30,     vr19         // t10
    vssub.h       vr30,     vr26,     vr27         // t13
    vsadd.h       vr24,     vr26,     vr27         // t14

    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
    vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27
    vssrarni.h.w  vr26,     vr0,      12           // t13a
    vssrarni.h.w  vr27,     vr1,      12           // t10a

    vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
    vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30
    vssrarni.h.w  vr31,     vr0,      12           // t12
    vssrarni.h.w  vr30,     vr1,      12           // t11

    // Final butterfly with the even half held in vr11 vr12 ... vr18.
    // The statement order matters: several destinations (vr14..vr18, vr30,
    // vr26) are sources of later lines and are only overwritten once their
    // last reader has run.
    vsadd.h       vr28,     vr14,     vr31         // c[3]
    vssub.h       vr29,     vr14,     vr31         // c[12]
    vsadd.h       vr20,     vr15,     vr30         // c[4]
    vssub.h       vr21,     vr15,     vr30         // c[11]
    vsadd.h       vr14,     vr16,     vr27         // c[5]
    vssub.h       vr23,     vr16,     vr27         // c[10]
    vsadd.h       vr15,     vr17,     vr9          // c[6]
    vssub.h       vr30,     vr17,     vr9          // c[9]
    vsadd.h       vr16,     vr18,     vr25         // c[7]
    vssub.h       vr27,     vr18,     vr25         // c[8]
    vsadd.h       vr17,     vr13,     vr26         // c[2]
    vssub.h       vr26,     vr13,     vr26         // c[13]
    vsadd.h       vr18,     vr12,     vr24         // c[1]
    vssub.h       vr25,     vr12,     vr24         // c[14]
    vsadd.h       vr22,     vr11,     vr10         // c[0]
    vssub.h       vr24,     vr11,     vr10         // c[15]
.endm // dct_8x16_tx64_core_lsx

/*
 * vmul_vssrarni_hw - multiply one vector of eight 16-bit lanes by two
 * 32-bit constants and narrow both products back to 16 bits.
 *
 *   in0        : eight signed 16-bit lanes
 *   in1, in2   : vectors of 32-bit multipliers (callers in this file pass
 *                values broadcast with vldrepl.w)
 *   tmp0, tmp1 : scratch for the low-lane products
 *   out0       : in0 * in1, rounded shift right by 12, saturated to 16 bits
 *   out1       : in0 * in2, rounded shift right by 12, saturated to 16 bits
 *
 * Clobbers vr22 and vr23. in0 is fully consumed by the first two
 * instructions, so tmp1 may name the same register as in0 (dct64_step1_lsx
 * does this). The instruction order must stay as it is for that reason.
 */
.macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1
    vsllwil.w.h   vr22,      \in0,     0           // low four lanes -> 32 bit
    vexth.w.h     vr23,      \in0                  // high four lanes -> 32 bit
    vmul.w        \tmp0,     vr22,     \in1
    vmul.w        \out0,     vr23,     \in1
    vmul.w        \tmp1,     vr22,     \in2
    vmul.w        \out1,     vr23,     \in2
    // Narrowing puts the low-lane products (tmp) into the lower half of the
    // destination and the high-lane products into its upper half.
    vssrarni.h.w  \out0,     \tmp0,    12
    vssrarni.h.w  \out1,     \tmp1,    12
.endm

/*
 * 32-bit multipliers for dct64_step1_lsx, which reads twelve words (byte
 * offsets 0..44) relative to t0. The table holds four such 48-byte groups,
 * listed below as the pairs that are loaded together. Which group belongs
 * to which set of inputs is decided by the caller (not visible here);
 * presumably they follow the in1/31/17/15, in7/25/23/9, in5/27/21/11,
 * in3/29/19/13 order of the note in front of dct64_step2_lsx.
 */
const idct64_coeffs, align=4
    // group 0, bytes 0..47
    .word         101, 4095
    .word         2967, -2824
    .word         1660, 3745
    .word         3822, -1474
    .word         4076, 401
    .word         4017, 799
    // group 1, bytes 48..95
    .word         4036, -700
    .word         2359, 3349
    .word         3461, -2191
    .word         897, 3996
    .word         -3166, -2598
    .word         -799, -4017
    // group 2, bytes 96..143
    .word         501, 4065
    .word         3229, -2520
    .word         2019, 3564
    .word         3948, -1092
    .word         3612, 1931
    .word         2276, 3406
    // group 3, bytes 144..191
    .word         4085, -301
    .word         2675, 3102
    .word         3659, -1842
    .word         1285, 3889
    .word         -3920, -1189
    .word         -3406, -2276
endconst

/*
 * dct64_step1_lsx - first stage of the odd half of the 64-point inverse DCT
 * for one group of four inputs.
 *
 *   vr0..vr3 : the four input vectors (eight 16-bit lanes each); clobbered
 *   t0       : address of a twelve-word coefficient group (offsets 0..44
 *              are read; the value comments match idct64_coeffs group 0)
 *   t6       : destination; eight vectors are stored at t6 + 0 .. t6 + 112
 *              in the order t32a, t33, t34a, t35, t60, t61a, t62, t63a
 *              (names as for the first input group)
 *
 * Clobbers vr0..vr16 and vr20..vr23. vmul_vmadd_w / vmul_vmsub_w are defined
 * elsewhere; from their use here, "a, b, ca, cb, lo, hi" presumably yields
 * a*ca + b*cb (resp. a*ca - b*cb) as 32-bit lanes split into lo and hi.
 */
.macro dct64_step1_lsx
    // One multiply pair per input. tmp1 is the input register itself, which
    // vmul_vssrarni_hw permits; vr16 is the shared tmp0.
    vldrepl.w     vr20,     t0,       0            // 101
    vldrepl.w     vr21,     t0,       4            // 4095
    vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9    // vr8 t32a vr9 t63a
    vldrepl.w     vr20,     t0,       8            // 2967
    vldrepl.w     vr21,     t0,       12           // -2824
    vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11  // vr10 t62a vr11 t33a
    vldrepl.w     vr20,     t0,       16           // 1660
    vldrepl.w     vr21,     t0,       20           // 3745
    vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13  // vr12 t34a vr13 t61a
    vldrepl.w     vr20,     t0,       24           // 3822
    vldrepl.w     vr21,     t0,       28           // -1474
    vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15  // vr14 t60a vr15 t35a

    // Saturating butterflies
    vsadd.h       vr0,      vr8,      vr11         // t32
    vssub.h       vr1,      vr8,      vr11         // t33
    vssub.h       vr2,      vr15,     vr12         // t34
    vsadd.h       vr3,      vr15,     vr12         // t35
    vsadd.h       vr4,      vr14,     vr13         // t60
    vssub.h       vr5,      vr14,     vr13         // t61
    vssub.h       vr6,      vr9,      vr10         // t62
    vsadd.h       vr7,      vr9,      vr10         // t63

    // Rotations of (t62, t33) and (t61, t34) by the 4076/401 pair
    vldrepl.w     vr20,     t0,       32           // 4076
    vldrepl.w     vr21,     t0,       36           // 401
    vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10
    vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11
    vssrarni.h.w  vr10,     vr9,      12           // t62a
    vssrarni.h.w  vr11,     vr13,     12           // t33a

    vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1
    vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6
    vneg.w        vr9,      vr9                    // t34a takes the negated sum
    vneg.w        vr1,      vr1
    vssrarni.h.w  vr6,      vr13,     12           // t61a
    vssrarni.h.w  vr1,      vr9,      12           // t34a

    vsadd.h       vr2,      vr0,      vr3          // t32a
    vssub.h       vr5,      vr0,      vr3          // t35a
    vsadd.h       vr9,      vr11,     vr1          // t33
    vssub.h       vr13,     vr11,     vr1          // t34
    vssub.h       vr0,      vr7,      vr4          // t60a
    vsadd.h       vr3,      vr7,      vr4          // t63a
    vssub.h       vr1,      vr10,     vr6          // t61
    vsadd.h       vr11,     vr10,     vr6          // t62

    // Rotations of (t61, t34) and (t60a, t35a) by the 4017/799 pair
    vldrepl.w     vr20,     t0,       40           // 4017
    vldrepl.w     vr21,     t0,       44           // 799
    vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4
    vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7
    vssrarni.h.w  vr4,      vr8,      12           // t61a
    vssrarni.h.w  vr7,      vr12,     12           // t34a

    vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6
    vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10
    vssrarni.h.w  vr6,      vr8,      12           // t60
    vssrarni.h.w  vr10,     vr12,     12           // t35

    // t32a t33 t34a t35 t60 t61a t62 t63a
    vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3
.endm // dct64_step1

    // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
    // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
    // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
    // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
/*
 * dct64_step2_lsx - second stage of the odd half of the 64-point inverse
 * DCT, performed in place on eight vectors.
 *
 *   t5, t4 : base addresses; vectors are read from and written back to
 *            t5 + {0, 128, 256, 384} and t4 + {0, 128, 256, 384}
 *   t0     : coefficient table; words at offsets 0, 8 and 12 are used.
 *            Going by the value comments (2896, 1567, 3784) this is
 *            expected to be idct_coeffs, not idct64_coeffs - the caller
 *            sets it up and is not visible here.
 *
 * Clobbers vr0..vr15 and vr20, vr21, plus any temporaries of
 * vmul_vmadd_w / vmul_vmsub_w (defined elsewhere).
 */
.macro dct64_step2_lsx
    vld           vr0,      t5,       0            // t32a
    vld           vr2,      t4,       0            // t63a
    vld           vr3,      t5,       16*8         // t56a
    vld           vr1,      t4,       16*8         // t39a
    vld           vr4,      t5,       16*16        // t40a
    vld           vr6,      t4,       16*16        // t55a
    vld           vr7,      t5,       16*24        // t48a
    vld           vr5,      t4,       16*24        // t47a

    vsadd.h       vr8,      vr0,      vr1          // t32
    vssub.h       vr9,      vr0,      vr1          // t39
    vsadd.h       vr10,     vr2,      vr3          // t63
    vssub.h       vr11,     vr2,      vr3          // t56
    vssub.h       vr12,     vr5,      vr4          // t40
    vsadd.h       vr13,     vr5,      vr4          // t47
    vsadd.h       vr14,     vr7,      vr6          // t48
    vssub.h       vr15,     vr7,      vr6          // t55
    // Rotations of (t56, t39) and (t55, t40) by the 1567/3784 pair
    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vmul_vmadd_w  vr11, vr9, vr21, vr20, vr0, vr2
    vmul_vmsub_w  vr11, vr9, vr20, vr21, vr1, vr3
    vssrarni.h.w  vr2,      vr0,      12           // t56a
    vssrarni.h.w  vr3,      vr1,      12           // t39a
    vmul_vmadd_w  vr15, vr12, vr21, vr20, vr0, vr4
    vmul_vmsub_w  vr15, vr12, vr20, vr21, vr1, vr5
    vneg.w        vr0,      vr0                    // t40a takes the negated sum
    vneg.w        vr4,      vr4
    vssrarni.h.w  vr5,      vr1,      12           // t55a
    vssrarni.h.w  vr4,      vr0,      12           // t40a
    vsadd.h       vr9,      vr8,      vr13         // t32a
    vssub.h       vr11,     vr8,      vr13         // t47a
    vsadd.h       vr6,      vr3,      vr4          // t39
    vssub.h       vr7,      vr3,      vr4          // t40
    vssub.h       vr12,     vr10,     vr14         // t48a
    vsadd.h       vr15,     vr10,     vr14         // t63a
    vssub.h       vr0,      vr2,      vr5          // t55
    vsadd.h       vr1,      vr2,      vr5          // t56

    // Difference and sum of each pair, both multiplied by 2896
    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13
    vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4
    vssrarni.h.w  vr13,     vr8,      12           // t40a
    vssrarni.h.w  vr4,      vr3,      12           // t55a
    vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10
    vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14
    vssrarni.h.w  vr10,     vr8,      12           // t47
    vssrarni.h.w  vr14,     vr3,      12           // t48

    // Write back over the slots that were loaded at the top
    // t32a t39 t40a t47  t48  t55a t56 t63a
    // vr9  vr6 vr13 vr10 vr14 vr4  vr1 vr15
    vst           vr9,      t5,       0            // t32a
    vst           vr6,      t4,       0            // t39
    vst           vr13,     t5,       16*8         // t40a
    vst           vr10,     t4,       16*8         // t47
    vst           vr14,     t5,       16*16        // t48
    vst           vr4,      t4,       16*16        // t55a
    vst           vr1,      t5,       16*24        // t56
    vst           vr15,     t4,       16*24        // t63a
.endm // dct64_step2_lsx

/*
 * dct64_step3_lsx - last butterfly of the 64-point inverse DCT for eight
 * outputs and their mirrored counterparts.
 *
 *   t3     : eight consecutive vectors t0..t7
 *   t5, t4 : addresses from which t56..t59a (t5 + 384 upwards) and
 *            t60..t63a (t4 + 384 downwards) are read
 *
 * Results:
 *   c[0]..c[7]   in vr20, vr22, vr24, vr26, vr28, vr30, vr2,  vr1
 *   c[63]..c[56] in vr21, vr23, vr25, vr27, vr29, vr31, vr15, vr3
 * Other vector registers written: vr4, vr6..vr14, vr16, vr17.
 */
.macro dct64_step3_lsx
    //                t0   t1   t2   t3   t4    t5    t6    t7
    vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17

    // Partner values, fetched in the order they are paired with t0..t7
    vld           vr15,     t4,       16*24    // t63a
    vld           vr1,      t4,       16*24-16 // t62
    vld           vr4,      t4,       16*24-32 // t61a
    vld           vr14,     t4,       16*24-48 // t60
    vld           vr10,     t5,       16*24+48 // t59a
    vld           vr13,     t5,       16*24+32 // t58
    vld           vr6,      t5,       16*24+16 // t57a
    vld           vr9,      t5,       16*24    // t56

    // Rows 0..5 write only to vr20..vr31
    vssub.h       vr21,     vr2,      vr15     // c[63]
    vsadd.h       vr20,     vr2,      vr15     // c[0]
    vssub.h       vr23,     vr3,      vr1      // c[62]
    vsadd.h       vr22,     vr3,      vr1      // c[1]
    vssub.h       vr25,     vr7,      vr4      // c[61]
    vsadd.h       vr24,     vr7,      vr4      // c[2]
    vssub.h       vr27,     vr8,      vr14     // c[60]
    vsadd.h       vr26,     vr8,      vr14     // c[3]
    vssub.h       vr29,     vr11,     vr10     // c[59]
    vsadd.h       vr28,     vr11,     vr10     // c[4]
    vssub.h       vr31,     vr12,     vr13     // c[58]
    vsadd.h       vr30,     vr12,     vr13     // c[5]

    // vr2, vr15, vr1 and vr3 have had their last read above and are reused
    // for the remaining four results
    vssub.h       vr15,     vr16,     vr6      // c[57]
    vsadd.h       vr2,      vr16,     vr6      // c[6]
    vssub.h       vr3,      vr17,     vr9      // c[56]
    vsadd.h       vr1,      vr17,     vr9      // c[7]
.endm // dct64_step3_lsx

/*
 * dct64_step4_lsx - run dct64_step3_lsx, then optionally transpose and
 * round the sixteen result vectors and store them relative to t7.
 *
 *   transpose8x8    : non-blank -> each group of eight vectors is
 *                     transposed in place with LSX_TRANSPOSE8x8_H
 *   shift           : non-blank -> rounding right shift applied to all
 *                     sixteen vectors
 *   start0, stride0 : offset/stride for c[0]..c[7]   (vr20 ... vr1)
 *   start1, stride1 : offset/stride for c[56]..c[63] (vr3 ... vr21)
 *
 * The transposes use vr4, vr7, vr8, vr14, vr10..vr13 as temporaries.
 */
.macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1
    dct64_step3_lsx

.ifnb \transpose8x8
    // group A: c[0]..c[7]
    LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
                       vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
                       vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13

    // group B: c[56]..c[63]
    LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
                       vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
                       vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13
.endif

.ifnb \shift
    // group A
.irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
     vsrari.h     \i,       \i,       \shift
.endr
    // group B
.irp i, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
     vsrari.h     \i,       \i,       \shift
.endr
.endif

    vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
    vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
.endm // dct64_step4_lsx

/*
 * dct64_step5_lsx - add eight residual vectors to eight 8-pixel rows and
 * store the clipped result.
 *
 *   in0..in7 : residual vectors (eight 16-bit lanes each), one per row
 *   a1       : row stride in bytes
 *   t0, t6   : read pointers; rows are taken from t0, t0+a1, t6, t6+a1 and,
 *              after both pointers have advanced by 4*a1, from the same
 *              four positions again
 *   t1, t2   : write pointers, each advanced by 2*a1 between stores
 *
 * The store pattern is consistent with t1 = initial t0, t2 = t0 + a1 and
 * t6 = t0 + 2*a1; the caller sets these up and is not visible here, so
 * treat that as an assumption.
 *
 * Clobbers t0, t6, t1, t2, vr4..vr11 and the fixed shift destinations vr20,
 * vr22, vr24, vr26, vr28, vr30, vr2, vr1. The eight shifts run strictly in
 * argument order, so an argument must not name the shift destination of an
 * earlier argument.
 */
.macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7
    // Load 8 bytes per row; fN is the low half of vrN, which the widening
    // below relies on.
    fld.d         f4,       t0,       0
    fldx.d        f5,       t0,       a1
    fld.d         f6,       t6,       0
    fldx.d        f7,       t6,       a1
    alsl.d        t0,       a1,       t0,    2     // t0 += 4 * a1
    alsl.d        t6,       a1,       t6,    2     // t6 += 4 * a1
    fld.d         f8,       t0,       0
    fldx.d        f9,       t0,       a1
    fld.d         f10,      t6,       0
    fldx.d        f11,      t6,       a1
    // Zero-extend the eight pixels of every row to 16 bits
.irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11
    vsllwil.hu.bu   \i,      \i,       0
.endr
    // Rounding shift of the residual by 4
    vsrari.h      vr20,     \in0,     4
    vsrari.h      vr22,     \in1,     4
    vsrari.h      vr24,     \in2,     4
    vsrari.h      vr26,     \in3,     4
    vsrari.h      vr28,     \in4,     4
    vsrari.h      vr30,     \in5,     4
    vsrari.h      vr2,      \in6,     4
    vsrari.h      vr1,      \in7,     4
    // pixel + residual
    vadd.h        vr4,      vr4,      vr20
    vadd.h        vr5,      vr5,      vr22
    vadd.h        vr6,      vr6,      vr24
    vadd.h        vr7,      vr7,      vr26
    vadd.h        vr8,      vr8,      vr28
    vadd.h        vr9,      vr9,      vr30
    vadd.h        vr10,     vr10,     vr2
    vadd.h        vr11,     vr11,     vr1
    // Saturate to unsigned bytes and pack two rows per register: the
    // even-numbered row ends up in the low 64 bits, the odd one in the high
    vssrani.bu.h  vr5,      vr4,      0
    vssrani.bu.h  vr7,      vr6,      0
    vssrani.bu.h  vr9,      vr8,      0
    vssrani.bu.h  vr11,     vr10,     0

    // Low half goes to t1, high half to t2; both step by two rows
    vstelm.d      vr5,      t1,       0,     0
    vstelm.d      vr5,      t2,       0,     1
    alsl.d        t1,       a1,       t1,    1
    alsl.d        t2,       a1,       t2,    1
    vstelm.d      vr7,      t1,       0,     0
    vstelm.d      vr7,      t2,       0,     1
    alsl.d        t1,       a1,       t1,    1
    alsl.d        t2,       a1,       t2,    1
    vstelm.d      vr9,      t1,       0,     0
    vstelm.d      vr9,      t2,       0,     1
    alsl.d        t1,       a1,       t1,    1
    alsl.d        t2,       a1,       t2,    1
    vstelm.d      vr11,     t1,       0,     0
    vstelm.d      vr11,     t2,       0,     1
.endm // dct64_step5_lsx

.macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1, rect2
    vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x16_tx64_core_lsx \rect2

    vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vxor.v        vr31,     vr31,     vr31
    vst_x8 t2, \vld_loc0, \stride0, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    vst_x8 t2, \vld_loc1, \stride1, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    la.local      t0,       idct_coeffs

.ifc \rect2, rect2_lsx
    vldrepl.w     vr23,      t0,       0        // 2896
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    rect2_lsx \i, vr23, \i
.endr
.endif

    vldrepl.w     vr20,     t0,       64           // 201
    vldrepl.w     vr21,     t0,       68           // 4091
    vsllwil.w.h   vr22,     vr0,      0
    vexth.w.h     vr23,     vr0
    vmul.w        vr8,      vr22,     vr21
    vmul.w        vr9,      vr23,     vr21
    vmul.w        vr0,      vr22,     vr20
    vmul.w        vr10,     vr23,     vr20
    vssrarni.h.w  vr9,      vr8,      12           // t31a
    vssrarni.h.w  vr10,     vr0,      12           // t16a

    vldrepl.w     vr20,     t0,       72           // 3035
    vldrepl.w     vr21,     t0,       76           // 2751
    vsllwil.w.h   vr22,     vr7,      0
    vexth.w.h     vr23,     vr7
    vneg.w        vr21,     vr21
    vmul.w        vr8,      vr22,     vr20
    vmul.w        vr0,      vr23,     vr20
    vmul.w        vr7,      vr22,     vr21
    vmul.w        vr30,     vr23,     vr21
    vssrarni.h.w  vr0,      vr8,      12           // t30a
    vssrarni.h.w  vr30,     vr7,      12           // t17a

    vldrepl.w     vr20,     t0,       80           // 1751
    vldrepl.w     vr21,     t0,       84           // 3703
    vsllwil.w.h   vr22,     vr4,      0
    vexth.w.h     vr23,     vr4
    vmul.w        vr8,      vr22,     vr21
    vmul.w        vr7,      vr23,     vr21
    vmul.w        vr4,      vr22,     vr20
    vmul.w        vr19,     vr23,     vr20
    vssrarni.h.w  vr7,      vr8,      12           // t29a
    vssrarni.h.w  vr19,     vr4,      12           // t18a

    vldrepl.w     vr20,     t0,       88           // 3857
    vldrepl.w     vr21,     t0,       92           // 1380
    vsllwil.w.h   vr22,     vr3,      0
    vexth.w.h     vr23,     vr3
    vneg.w        vr21,     vr21
    vmul.w        vr8,      vr22,     vr20
    vmul.w        vr4,      vr23,     vr20
    vmul.w        vr3,      vr22,     vr21
    vmul.w        vr26,     vr23,     vr21
    vssrarni.h.w  vr4,      vr8,      12           // t28a
    vssrarni.h.w  vr26,     vr3,      12           // t19a

    vldrepl.w     vr20,     t0,       96           // 995
    vldrepl.w     vr21,     t0,       100          // 3973
    vsllwil.w.h   vr22,     vr2,      0
    vexth.w.h     vr23,     vr2
    vmul.w        vr8,      vr22,     vr21
    vmul.w        vr3,      vr23,     vr21
    vmul.w        vr2,      vr22,     vr20
    vmul.w        vr27,     vr23,     vr20
    vssrarni.h.w  vr3,      vr8,      12           // t27a
    vssrarni.h.w  vr27,     vr2,      12           // t20a

    vldrepl.w     vr20,     t0,       104          // 3513
    vldrepl.w     vr21,     t0,       108          // 2106
    vsllwil.w.h   vr22,     vr5,      0
    vexth.w.h     vr23,     vr5
    vneg.w        vr21,     vr21
    vmul.w        vr8,      vr22,     vr20
    vmul.w        vr2,      vr23,     vr20
    vmul.w        vr5,      vr22,     vr21
    vmul.w        vr28,     vr23,     vr21
    vssrarni.h.w  vr2,      vr8,      12           // t26a
    vssrarni.h.w  vr28,     vr5,      12           // t21a

    vldrepl.w     vr20,     t0,       112          // 2440 -> 1220
    vldrepl.w     vr21,     t0,       116          // 3290 -> 1645
    vsllwil.w.h   vr22,     vr6,      0
    vexth.w.h     vr23,     vr6
    vmul.w        vr8,      vr22,     vr21
    vmul.w        vr5,      vr23,     vr21
    vmul.w        vr6,      vr22,     vr20
    vmul.w        vr25,     vr23,     vr20
    vssrarni.h.w  vr5,      vr8,      12           // t25a
    vssrarni.h.w  vr25,     vr6,      12           // t22a

    vldrepl.w     vr20,     t0,       120          // 4052
    vldrepl.w     vr21,     t0,       124          // 601
    vsllwil.w.h   vr22,     vr1,      0
    vexth.w.h     vr23,     vr1
    vneg.w        vr21,     vr21
    vmul.w        vr8,      vr22,     vr20
    vmul.w        vr6,      vr23,     vr20
    vmul.w        vr1,      vr22,     vr21
    vmul.w        vr24,     vr23,     vr21
    vssrarni.h.w  vr6,      vr8,      12           // t24a
    vssrarni.h.w  vr24,     vr1,      12           // t23a

    vsadd.h       vr1,      vr10,     vr30         // t16
    vssub.h       vr29,     vr10,     vr30         // t17
    vssub.h       vr8,      vr26,     vr19         // t18
    vsadd.h       vr31,     vr26,     vr19         // t19
    vsadd.h       vr10,     vr27,     vr28         // t20
    vssub.h       vr30,     vr27,     vr28         // t21
    vssub.h       vr19,     vr24,     vr25         // t22
    vsadd.h       vr26,     vr24,     vr25         // t23
    vsadd.h       vr27,     vr6,      vr5          // t24
    vssub.h       vr28,     vr6,      vr5          // t25
    vssub.h       vr24,     vr3,      vr2          // t26
    vsadd.h       vr25,     vr3,      vr2          // t27
    vsadd.h       vr5,      vr4,      vr7          // t28
    vssub.h       vr6,      vr4,      vr7          // t29
    vssub.h       vr2,      vr9,      vr0          // t30
    vsadd.h       vr3,      vr9,      vr0          // t31

    vldrepl.w     vr20,     t0,       16           // 799
    vldrepl.w     vr21,     t0,       20           // 4017
    vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
    vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
    vssrarni.h.w  vr7,      vr4,      12           // t30a
    vssrarni.h.w  vr0,      vr11,     12           // t17a
    vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
    vneg.w        vr4,      vr4
    vneg.w        vr9,      vr9
    vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
    vssrarni.h.w  vr9,      vr4,      12           // t18a
    vssrarni.h.w  vr2,      vr11,     12           // t29a

    vldrepl.w     vr20,     t0,       24           // 3406 -> 1703
    vldrepl.w     vr21,     t0,       28           // 2276 -> 1138
    vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
    vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
    vssrarni.h.w  vr29,     vr4,      12           // t26a
    vssrarni.h.w  vr6,      vr11,     12           // t21a

    vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
    vneg.w        vr4,      vr4
    vneg.w        vr8,      vr8
    vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
    vssrarni.h.w  vr8,      vr4,      12           // t22a
    vssrarni.h.w  vr24,     vr11,     12           // t25a

    vsadd.h       vr4,      vr1,      vr31         // t16a
    vssub.h       vr30,     vr1,      vr31         // t19a
    vsadd.h       vr19,     vr0,      vr9          // t17
    vssub.h       vr28,     vr0,      vr9          // t18
    vssub.h       vr1,      vr26,     vr10         // t20a
    vsadd.h       vr31,     vr26,     vr10         // t23a
    vssub.h       vr0,      vr8,      vr6          // t21
    vsadd.h       vr9,      vr8,      vr6          // t22
    vsadd.h       vr10,     vr27,     vr25         // t24a
    vssub.h       vr26,     vr27,     vr25         // t27a
    vsadd.h       vr6,      vr24,     vr29         // t25
    vssub.h       vr8,      vr24,     vr29         // t26
    vssub.h       vr25,     vr3,      vr5          // t28a
    vsadd.h       vr27,     vr3,      vr5          // t31a
    vssub.h       vr24,     vr7,      vr2          // t29
    vsadd.h       vr29,     vr7,      vr2          // t30

    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
    vssrarni.h.w  vr5,      vr3,      12           // t29a
    vssrarni.h.w  vr2,      vr11,     12           // 18a

    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
    vssrarni.h.w  vr7,      vr3,      12           // t28
    vssrarni.h.w  vr24,     vr11,     12           // t19

    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
    vneg.w        vr3,      vr3
    vneg.w        vr28,     vr28
    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
    vssrarni.h.w  vr28,     vr3,      12           // t20
    vssrarni.h.w  vr25,     vr11,     12           // t27

    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
    vneg.w        vr3,      vr3
    vneg.w        vr30,     vr30
    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
    vssrarni.h.w  vr30,     vr3,      12           // t21a
    vssrarni.h.w  vr1,      vr11,     12           // t26a

    vsadd.h       vr3,      vr4,      vr31         // t16
    vssub.h       vr26,     vr4,      vr31         // t23
    vsadd.h       vr0,      vr19,     vr9          // t17a
    vssub.h       vr8,      vr19,     vr9          // t22a
    vsadd.h       vr4,      vr2,      vr30         // t18
    vssub.h       vr31,     vr2,      vr30         // t21
    vsadd.h       vr9,      vr24,     vr28         // t19a
    vssub.h       vr19,     vr24,     vr28         // t20a
    vssub.h       vr2,      vr27,     vr10         // t24
    vsadd.h       vr30,     vr27,     vr10         // t31
    vssub.h       vr24,     vr29,     vr6          // t25a
    vsadd.h       vr28,     vr29,     vr6          // t30a
    vssub.h       vr10,     vr5,      vr1          // t26
    vsadd.h       vr27,     vr5,      vr1          // t29
    vssub.h       vr6,      vr7,      vr25         // t27a
    vsadd.h       vr29,     vr7,      vr25         // t28a

    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
    vssrarni.h.w  vr5,      vr1,      12           // t20
    vssrarni.h.w  vr7,      vr11,     12           // t27

    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
    vssrarni.h.w  vr25,     vr1,      12           // t21a
    vssrarni.h.w  vr6,      vr11,     12           // t26a

    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
    vssrarni.h.w  vr19,     vr1,      12           // t22
    vssrarni.h.w  vr10,     vr11,     12           // t25

    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
    vssrarni.h.w  vr31,     vr1,      12           // t23a
    vssrarni.h.w  vr8,      vr11,     12           // t24a

    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
    vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h       vr1,      vr11,     vr30         // c[0]
    vssub.h       vr2,      vr11,     vr30         // c[31]
    vsadd.h       vr24,     vr12,     vr28         // c[1]
    vssub.h       vr26,     vr12,     vr28         // c[30]
    vsadd.h       vr11,     vr13,     vr27         // c[2]
    vssub.h       vr30,     vr13,     vr27         // c[29]
    vsadd.h       vr12,     vr14,     vr29         // c[3]
    vssub.h       vr28,     vr14,     vr29         // c[28]
    vsadd.h       vr13,     vr15,     vr7          // c[4]
    vssub.h       vr27,     vr15,     vr7          // c[27]
    vsadd.h       vr14,     vr16,     vr6          // c[5]
    vssub.h       vr29,     vr16,     vr6          // c[26]
    vsadd.h       vr7,      vr17,     vr10         // c[6]
    vssub.h       vr15,     vr17,     vr10         // c[25]
    vsadd.h       vr6,      vr18,     vr8          // c[7]
    vssub.h       vr16,     vr18,     vr8          // c[24]

    vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

    vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2

    vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h       vr1,      vr11,     vr31         // c[8]
    vssub.h       vr2,      vr11,     vr31         // c[23]
    vsadd.h       vr24,     vr12,     vr19         // c[9]
    vssub.h       vr26,     vr12,     vr19         // c[22]
    vsadd.h       vr11,     vr13,     vr25         // c[10]
    vssub.h       vr30,     vr13,     vr25         // c[21]
    vsadd.h       vr12,     vr14,     vr5          // c[11]
    vssub.h       vr28,     vr14,     vr5          // c[20]
    vsadd.h       vr13,     vr15,     vr9          // c[12]
    vssub.h       vr27,     vr15,     vr9          // c[19]
    vsadd.h       vr14,     vr16,     vr4          // c[13]
    vssub.h       vr29,     vr16,     vr4          // c[18]
    vsadd.h       vr7,      vr17,     vr0          // c[14]
    vssub.h       vr15,     vr17,     vr0          // c[17]
    vsadd.h       vr6,      vr18,     vr3          // c[15]
    vssub.h       vr16,     vr18,     vr3          // c[16]

    vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

    vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
.endm // dct_8x32_tx64_new_lsx

// DST_ADD_W64: add residuals to one 64-pixel row and store it at a0.
//   vr10..vr13 (in)  16 unsigned 8-bit pixels each, i.e. bytes 0..63 of the row
//   in0..in7         eight vectors of 16-bit residuals; in0/in1 are added to
//                    the low/high eight pixels of vr10, in2/in3 to vr11,
//                    in4/in5 to vr12 and in6/in7 to vr13
// The sums are narrowed back to bytes with unsigned saturation and written
// to a0+0, +16, +32 and +48.  Clobbers vr4..vr7 and vr10..vr13; a0 itself is
// left untouched.
.macro DST_ADD_W64 in0, in1, in2, in3, in4, in5, in6, in7
    // Low eight pixels of every 16-pixel group, zero-extended to 16 bit.
    vsllwil.hu.bu vr4,      vr10,     0
    vsllwil.hu.bu vr5,      vr11,     0
    vsllwil.hu.bu vr6,      vr12,     0
    vsllwil.hu.bu vr7,      vr13,     0
    // High eight pixels, zero-extended in place.
.irp i, vr10, vr11, vr12, vr13
    vexth.hu.bu   \i,       \i
.endr
    // pixel + residual, kept in the same low/high interleaving as the inputs.
    vadd.h        vr4,      vr4,      \in0
    vadd.h        vr10,     vr10,     \in1
    vadd.h        vr5,      vr5,      \in2
    vadd.h        vr11,     vr11,     \in3
    vadd.h        vr6,      vr6,      \in4
    vadd.h        vr12,     vr12,     \in5
    vadd.h        vr7,      vr7,      \in6
    vadd.h        vr13,     vr13,     \in7
    // Shift by zero: only the saturating 16 -> 8 bit narrowing is wanted.
    // The low-half sums end up in the low bytes, the high-half sums above.
    vssrani.bu.h  vr10,     vr4,      0
    vssrani.bu.h  vr11,     vr5,      0
    vssrani.bu.h  vr12,     vr6,      0
    vssrani.bu.h  vr13,     vr7,      0
    // Write the finished row back.
    vst           vr10,     a0,       0
    vst           vr11,     a0,       16
    vst           vr12,     a0,       32
    vst           vr13,     a0,       48
.endm

// idct_dc_w64: DC-only setup for 64-pixel-wide blocks.
//   w, h    block dimensions, only used to decide whether the extra
//           rectangular scaling step is emitted (one side twice the other)
//   shift   rounding shift applied after the first scaling (skipped when 0)
// Reads the 16-bit DC coefficient at a2 and clears it in memory, and loads
// the first destination row (64 bytes at a0) into vr10..vr13.
// On exit every 16-bit lane of vr20 holds the DC residual.
// Clobbers t2, vr0, vr1, vr2.  The row loads are interleaved with the
// arithmetic; they do not depend on it.
.macro idct_dc_w64 w, h, shift
    ld.h          t2,       a2,       0            // dc (sign-extended)
    vldi          vr0,      0x8b5                  // 181 in every 32-bit lane
    vreplgr2vr.w  vr1,      t2
    vldi          vr20,     0x880                  // 128 in every 32-bit lane
    vmul.w        vr2,      vr0,      vr1          // dc * 181
    st.h          zero,     a2,       0            // consume the coefficient
    vsrari.w      vr2,      vr2,      8            // rounding shift right by 8
    vld           vr13,     a0,       48

    // Only emitted for 2:1 / 1:2 blocks: scale by 181/256 a second time.
.if (2*\w == \h) || (2*\h == \w)
    vmul.w        vr2,      vr2,      vr0
    vsrari.w      vr2,      vr2,      8
.endif

.if \shift>0
    vsrari.w      vr2,      vr2,      \shift
.endif
    vld           vr11,     a0,       16
    vmadd.w       vr20,     vr2,      vr0          // 128 + dc * 181
    vld           vr12,     a0,       32
    // Rounding shift by 12 and saturating narrow to 16 bit; both source
    // operands are vr20, so all eight halfword lanes carry the same value.
    vssrarni.h.w  vr20,     vr20,     12
    vld           vr10,     a0,       0
.endm

// 64x64 inverse DCT/DCT with add to destination, 8 bpc.
// Register use as visible in this function:
//   a0  destination pixels (64 bytes are read and written per row)
//   a1  byte offset from one destination row to the next
//   a2  16-bit coefficients (read, then cleared)
//   a3  zero selects the DC-only path; otherwise it is compared against the
//       eob_32x32 thresholds to decide how much of the first pass to run
function inv_txfm_add_dct_dct_64x64_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_64x64

    // DC-only: every pixel gets the same residual (vr20).
    // idct_dc_w64 already loaded row 0 into vr10..vr13.
    idct_dc_w64 64, 64, 2

    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20

    // Remaining 63 rows: step to the next row, load it, add, store.
    li.w          t3,       63
.loop63:
    add.d         a0,       a0,       a1
    vld           vr10,     a0,       0
    vld           vr11,     a0,       16
    vld           vr12,     a0,       32
    vld           vr13,     a0,       48
    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
    addi.d        t3,       t3,       -1
    blt           zero,     t3,       .loop63
    // Nothing was allocated on this path, so skip the free_space epilogue.
    b             .DCT_DCT_64X64_END
.NO_HAS_DCONLY_64x64:

    // Stack scratch: 64*32*2 bytes of first-pass output followed by two
    // 512-byte work areas (addressed below from sp+64 onwards).
    // NOTE(review): the 64-byte offset presumably skips a register save area
    // set up by malloc_space (compare PUSH_REG) - confirm in its definition.
    malloc_space  64*32*2+512+512

// dct64x64_core1_lsx: one first-pass step of the 64-point DCT.
//   shift   forwarded to dct64_step4_lsx
//   rect2   pass rect2_lsx to pre-scale the odd inputs with rect2_lsx (and
//           forward the same token to dct_8x32_tx64_new_lsx); any other
//           token, e.g. no_rect2, skips the scaling
// Expected to be set up by the caller (see both users of this macro):
//   t2, t7  running pointers; they are not referenced directly in this body
//           NOTE(review): presumably consumed inside dct_8x32_tx64_new_lsx
//           and dct64_step4_lsx - confirm in those macros
//   a4      address of odd input in1; in(2k+1) is read from a4 + 128*k
// The sixteen odd inputs read through a4 are zeroed in memory after loading.
// Clobbers t0, t3, t4, t5, t6, a6, vr0..vr3, vr23, vr31 plus whatever the
// nested macros use.
// This macro is defined in the middle of the 64x64 function body; a
// definition emits no code at that point.
.macro dct64x64_core1_lsx shift, rect2
    // The two commented-out lines below name parameters (in0, in1) that this
    // macro does not have; they are leftovers and intentionally inactive.
    //addi.d        t2,       a2,       \in0
    //addi.d        t7,       t7,       \in1
    // t3 = first 512-byte work area, t6 = t5 = second work area.  The offset
    // does not fit an addi.d immediate, hence li.w + add.d.
    li.w          t4,       64*32*2+64
    add.d         t3,       sp,       t4
    addi.d        t6,       t3,       512
    add.d         t5,       t6,       zero

    dct_8x32_tx64_new_lsx 0, 256, 128, 256, \rect2

    la.local      t0,       idct64_coeffs
    vxor.v        vr31,     vr31,     vr31         // zero, used to clear inputs

    // Same leftover situation as above (no in2 parameter exists).
    //addi.d        a4,       a2,       \in2         // 32 ...
    // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
    vld           vr0,      a4,       128*0        // in1
    vld           vr1,      a4,       128*15       // in31
    vld           vr2,      a4,       128*8        // in17
    vld           vr3,      a4,       128*7        // in15
    la.local      a6,       idct_coeffs
.ifc \rect2, rect2_lsx
    vldrepl.w     vr23,      a6,       0        // 2896
.irp i, vr0, vr1, vr2, vr3
    rect2_lsx \i, vr23, \i
.endr
.endif
    vst           vr31,     a4,       128*0
    vst           vr31,     a4,       128*15
    vst           vr31,     a4,       128*8
    vst           vr31,     a4,       128*7
    dct64_step1_lsx

    // Each further group advances the coefficient table by 48 bytes and the
    // output area by 128 bytes.
    addi.d        t0,       t0,       48
    addi.d        t6,       t6,       128
    // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
    vld           vr0,      a4,       128*3        // in7
    vld           vr1,      a4,       128*12       // in25
    vld           vr2,      a4,       128*11       // in23
    vld           vr3,      a4,       128*4        // in9
    la.local      a6,       idct_coeffs
.ifc \rect2, rect2_lsx
    vldrepl.w     vr23,      a6,       0        // 2896
.irp i, vr0, vr1, vr2, vr3
    rect2_lsx \i, vr23, \i
.endr
.endif
    vst           vr31,     a4,       128*3
    vst           vr31,     a4,       128*12
    vst           vr31,     a4,       128*11
    vst           vr31,     a4,       128*4
    dct64_step1_lsx

    addi.d        t0,       t0,       48
    addi.d        t6,       t6,       128
    // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
    vld           vr0,      a4,       128*2        // in5
    vld           vr1,      a4,       128*13       // in27
    vld           vr2,      a4,       128*10       // in21
    vld           vr3,      a4,       128*5        // in11
    la.local      a6,       idct_coeffs
.ifc \rect2, rect2_lsx
    vldrepl.w     vr23,      a6,      0        // 2896
.irp i, vr0, vr1, vr2, vr3
    rect2_lsx \i, vr23, \i
.endr
.endif
    vst           vr31,     a4,       128*2
    vst           vr31,     a4,       128*13
    vst           vr31,     a4,       128*10
    vst           vr31,     a4,       128*5
    dct64_step1_lsx

    addi.d        t0,       t0,       48
    addi.d        t6,       t6,       128
    // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    vld           vr0,      a4,       128*1        // in3
    vld           vr1,      a4,       128*14       // in29
    vld           vr2,      a4,       128*9        // in19
    vld           vr3,      a4,       128*6        // in13
    la.local      a6,       idct_coeffs
.ifc \rect2, rect2_lsx
    vldrepl.w     vr23,      a6,       0        // 2896
.irp i, vr0, vr1, vr2, vr3
    rect2_lsx \i, vr23, \i
.endr
.endif
    vst           vr31,     a4,       128*1
    vst           vr31,     a4,       128*14
    vst           vr31,     a4,       128*9
    vst           vr31,     a4,       128*6
    dct64_step1_lsx

    // Step 2 runs four times; t5 walks up and t4 walks down by one vector
    // (16 bytes) per run, starting 16*7 bytes apart.
    la.local      t0,       idct_coeffs
    addi.d        t4,       t5,       16*7
    // t32a/t39/t40a/t47/t48/t55a/t56/t63a
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t33/t38a/t41/t46a/t49a/t54/t57a/t62
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t34a/t37/t42a/t45/t50/t53a/t58/t61a
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t35/t36a/t43/t44a/t51a/t52/t59a/t60
    dct64_step2_lsx

    // Step 4 runs four times.  t5 is reset to the start of the second work
    // area and t4 to 16*7 bytes above it; between runs t3 advances by 128
    // bytes while t4 and t5 are each moved back by 128 bytes.
    li.w          t4,       64*32*2+64+512
    add.d         t5,       t4,       sp
    addi.d        t4,       t5,       16*7
    dct64_step4_lsx transpose8x8, \shift, 0, 128, 112, 128

    addi.d        t3,       t3,       128
    addi.d        t4,       t4,       -16*8
    addi.d        t5,       t5,       -16*8
    dct64_step4_lsx transpose8x8, \shift, 16, 128, 96, 128

    addi.d        t5,       t5,       -16*8
    addi.d        t4,       t4,       -16*8
    addi.d        t3,       t3,       128
    dct64_step4_lsx transpose8x8, \shift, 32, 128, 80, 128

    addi.d        t5,       t5,       -16*8
    addi.d        t4,       t4,       -16*8
    addi.d        t3,       t3,       128
    dct64_step4_lsx transpose8x8, \shift, 48, 128, 64, 128
.endm
    // ---- First pass of the 64x64 transform ----
    // Pointers consumed by dct64x64_core1_lsx:
    //   t2 = coefficients, t7 = sp+64 (first-pass output), a4 = a2+64 (in1).
    la.local      t8,       eob_32x32
    addi.d        t2,       a2,       0
    addi.d        t7,       sp,       64
    addi.d        t7,       t7,       0            // adds zero: no effect
    addi.d        a4,       a2,       64
.DCT_DCT_EOB_64x64:
    // Each iteration handles one group: the inputs move on by 16 bytes
    // (eight coefficients) and the output by 128*8 = 1024 bytes.  The loop
    // repeats while a3 is at least the current eob_32x32 threshold.
    ld.h          a5,       t8,       0
    addi.d        t8,       t8,       2
    dct64x64_core1_lsx 2, no_rect2
    addi.d        t2,       t2,       16
    addi.d        t7,       t7,       128*8
    addi.d        a4,       a4,       16
    bge           a3,       a5,       .DCT_DCT_EOB_64x64

    // Zero the 1024-byte output slabs of the groups that were skipped, from
    // the last slab downwards, so the second pass reads zeros there.
    la.local      t8,       eob_32x32
    vxor.v        vr31,     vr31,     vr31

    // a3 >= third threshold: all four groups were produced.
    ld.h          t7,       t8,       4
    bge           a3,       t7,       .DCT_DCT_EOB_64x64_END
    // Slab 3; the offset is too large for an addi.d immediate.
    li.d          t1,       1024*3+64
    add.d         t0,       sp,       t1
.rept 4
    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    addi.d t0, t0, 256
.endr

    // a3 >= second threshold: groups 0..2 were produced.
    ld.h          t7,       t8,       2
    bge           a3,       t7,       .DCT_DCT_EOB_64x64_END
    // Slab 2.
    li.d          t1,       1024*2+64
    add.d         t0,       sp,       t1
.rept 4
    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    addi.d        t0,       t0,       256
.endr
    // a3 >= first threshold: groups 0 and 1 were produced.
    ld.h          t7,       t8,       0
    bge           a3,       t7,       .DCT_DCT_EOB_64x64_END

    // Slab 1; only group 0 was produced.
    li.d          t1,       1024*1+64
    add.d         t0,       sp,       t1
.rept 4
    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    addi.d        t0,       t0,       256
.endr

.DCT_DCT_EOB_64x64_END:

// dct64x64_core2_lsx: second pass of the 64x64 transform for one group of
// eight destination columns.
//   in0     byte offset of the column group inside the first-pass buffer
//           at sp+64 (16 bytes, i.e. eight 16-bit values, per group)
//   in1     number of bytes added to a0 before writing this group
//   rect2   forwarded to dct_8x32_tx64_new_lsx
// Rows of the first-pass buffer are 128 bytes apart, so consecutive odd
// inputs are 256 bytes apart.  a0 is advanced by in1 and stays advanced.
// Clobbers t0..t8 and the vector registers used by the nested macros.
.macro dct64x64_core2_lsx in0, in1, rect2
    addi.d        t2,       sp,       64+\in0
    addi.d        t7,       sp,       64+\in0
    li.w          t4,       64*32*2+64
    add.d         t3,       sp,       t4
    addi.d        t6,       t3,       512
    add.d         t5,       t6,       zero

    // Bias t2 by 2048 in two steps (an addi.d immediate tops out at 2047);
    // the negative offsets passed below compensate for the bias.
    addi.d        t2,       t2,       1024
    addi.d        t2,       t2,       1024
    dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512, \rect2

    la.local      t0,       idct64_coeffs
    // t2 = in1 (one row, 64*2 bytes, past the group start); t4 = t2 + 256*8
    // so that in17..in31 can be reached with small offsets as well.
    addi.d        t2,       sp,       64+64*2+\in0
    addi.d        t4,       t2,       256*7
    addi.d        t4,       t4,       256

    vld           vr0,      t2,       256*0        // in1
    vld           vr1,      t4,       256*7        // in31
    vld           vr2,      t4,       256*0        // in17
    vld           vr3,      t2,       256*7        // in15
    dct64_step1_lsx

    // Each further group: coefficient table +48 bytes, output area +128.
    addi.d        t0,       t0,       48
    addi.d        t6,       t6,       128
    vld           vr0,      t2,       256*3        // in7
    vld           vr1,      t4,       256*4        // in25
    vld           vr2,      t4,       256*3        // in23
    vld           vr3,      t2,       256*4        // in9
    dct64_step1_lsx

    addi.d        t0,        t0,       48
    addi.d        t6,        t6,       128
    vld           vr0,       t2,       256*2       // in5
    vld           vr1,       t4,       256*5       // in27
    vld           vr2,       t4,       256*2       // in21
    vld           vr3,       t2,       256*5       // in11
    dct64_step1_lsx

    addi.d        t0,        t0,       48
    addi.d        t6,        t6,       128
    vld           vr0,       t2,       256*1       // in3
    vld           vr1,       t4,       256*6       // in29
    vld           vr2,       t4,       256*1       // in19
    vld           vr3,       t2,       256*6       // in13
    dct64_step1_lsx

    // Step 2 runs four times; t5 walks up and t4 walks down by 16 bytes.
    la.local      t0,       idct_coeffs
    addi.d        t4,       t5,       16*7
    // t32a/t39/t40a/t47/t48/t55a/t56/t63a
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t33/t38a/t41/t46a/t49a/t54/t57a/t62
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t34a/t37/t42a/t45/t50/t53a/t58/t61a
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t35/t36a/t43/t44a/t51a/t52/t59a/t60
    dct64_step2_lsx

    li.w          t4,       64*32*2+64+512
    add.d         t5,       t4,       sp
    addi.d        t4,       t5,       16*7
    // Move the destination pointer to this column group.
    addi.d        a0,       a0,       \in1
    // Below, each dct64_step3_lsx is followed by two dct64_step5_lsx calls,
    // one for a block of eight rows counted from the top and one for the
    // mirrored block counted from the bottom.  Before each call:
    //   t0 = t1 = a0 + first_row * a1, t2 = t0 + a1, t6 = t0 + 2 * a1
    // 0 - 7, 56 -63
    dct64_step3_lsx
    li.w          t8,       0
    mul.w         t0,       t8,       a1
    add.d         t0,       a0,       t0
    alsl.d        t6,       a1,       t0,      1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
    li.w          t8,       56
    mul.w         t0,       t8,       a1
    add.d         t0,       a0,       t0
    alsl.d        t6,       a1,       t0,      1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21

    // 8 - 15, 48 - 55
    addi.d        t3,       t3,       128
    addi.d        t4,       t4,       -16*8
    addi.d        t5,       t5,       -16*8
    dct64_step3_lsx
    li.w          t8,       8
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
    li.w          t8,       48
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21

    // 16 - 23, 40 - 47
    addi.d        t3,       t3,       128
    addi.d        t4,       t4,       -16*8
    addi.d        t5,       t5,       -16*8
    dct64_step3_lsx
    li.w          t8,       16
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
    li.w          t8,       40
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21

    // 24 - 31, 32 - 39
    addi.d        t3,       t3,       128
    addi.d        t4,       t4,       -16*8
    addi.d        t5,       t5,       -16*8
    dct64_step3_lsx
    li.w          t8,       24
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
    li.w          t8,       32
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
.endm
    // ---- Second pass of the 64x64 transform ----
    // Eight column groups of eight pixels each.  The first argument selects
    // the group inside the first-pass buffer; the second is added to a0, so
    // every group after the first moves the destination 8 bytes to the right.
    dct64x64_core2_lsx 16*0, 0, no_rect2
    dct64x64_core2_lsx 16*1, 8, no_rect2
    dct64x64_core2_lsx 16*2, 8, no_rect2
    dct64x64_core2_lsx 16*3, 8, no_rect2
    dct64x64_core2_lsx 16*4, 8, no_rect2
    dct64x64_core2_lsx 16*5, 8, no_rect2
    dct64x64_core2_lsx 16*6, 8, no_rect2
    dct64x64_core2_lsx 16*7, 8, no_rect2

    // Release the scratch area; the size matches the malloc_space above.
    free_space 64*32*2+512+512
    // The DC-only path joins here, after the release it must not execute.
.DCT_DCT_64X64_END:
endfunc

// 64x32 (64 pixels wide, 32 rows) inverse DCT/DCT with add, 8 bpc.
// Register use as visible in this function:
//   a0  destination pixels (64 bytes are read and written per row)
//   a1  byte offset from one destination row to the next
//   a2  16-bit coefficients (read, then cleared)
//   a3  zero selects the DC-only path; otherwise it is compared against the
//       eob_32x32 thresholds to decide how much of the first pass to run
// The first pass reuses dct64x64_core1_lsx, here with a rounding shift of 1
// and rect2 scaling enabled.
function inv_txfm_add_dct_dct_64x32_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_64x32

    // DC-only: every pixel gets the same residual (vr20).
    // idct_dc_w64 already loaded row 0 into vr10..vr13.
    idct_dc_w64 64, 32, 1

    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20

    // Remaining 31 rows.
    li.w          t3,       31
.loop31:
    add.d         a0,       a0,       a1
    vld           vr10,     a0,       0
    vld           vr11,     a0,       16
    vld           vr12,     a0,       32
    vld           vr13,     a0,       48
    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
    addi.d        t3,       t3,       -1
    blt           zero,     t3,       .loop31
    // Nothing was allocated on this path, so skip the free_space epilogue.
    b             .DCT_DCT_64X32_END
.NO_HAS_DCONLY_64x32:
    // Same scratch layout as the 64x64 function: first-pass output at sp+64,
    // followed by two 512-byte work areas.
    malloc_space  64*32*2+512+512

    // ---- First pass ----
    //   t2 = coefficients, t7 = sp+64 (first-pass output), a4 = a2+64 (in1).
    la.local      t8,       eob_32x32
    addi.d        t2,       a2,       0
    addi.d        t7,       sp,       64
    addi.d        t7,       t7,       0            // adds zero: no effect
    addi.d        a4,       a2,       64
.DCT_DCT_EOB_64x32:
    // One group per iteration; repeat while a3 is at least the current
    // eob_32x32 threshold.
    ld.h          a5,       t8,       0
    addi.d        t8,       t8,       2
    dct64x64_core1_lsx 1, rect2_lsx
    addi.d        t2,       t2,       16
    addi.d        t7,       t7,       128*8
    addi.d        a4,       a4,       16
    bge           a3,       a5,       .DCT_DCT_EOB_64x32

    // Zero the 1024-byte output slabs of the groups that were skipped, from
    // the last slab downwards.
    la.local      t8,       eob_32x32
    vxor.v        vr31,     vr31,     vr31

    ld.h          t7,       t8,       4
    bge           a3,       t7,       .DCT_DCT_EOB_64x32_END
    // Slab 3; the offset is too large for an addi.d immediate.
    li.d          t1,       1024*3+64
    add.d         t0,       sp,       t1
.rept 4
    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    addi.d t0, t0, 256
.endr

    ld.h          t7,       t8,       2
    bge           a3,       t7,       .DCT_DCT_EOB_64x32_END
    // Slab 2.
    li.d          t1,       1024*2+64
    add.d         t0,       sp,       t1
.rept 4
    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    addi.d        t0,       t0,       256
.endr

    ld.h          t7,       t8,       0
    bge           a3,       t7,       .DCT_DCT_EOB_64x32_END
    // Slab 1.
    li.d          t1,       1024*1+64
    add.d         t0,       sp,       t1
.rept 4
    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    addi.d        t0,       t0,       256
.endr

.DCT_DCT_EOB_64x32_END:
    // ---- Second pass: 32-point transform, eight columns per repetition ----
    //   t2 = current column group in the first-pass buffer (rows are 128
    //        bytes apart), t3 = scratch behind the buffer,
    //   t5 = t2 + 2048, reached in two steps because an addi.d immediate
    //        tops out at 2047.
    addi.d        t2,       sp,       64
    li.w          t4,       64*32*2+64
    add.d         t3,       sp,       t4
    addi.d        t5,       sp,       64
    addi.d        t5,       t5,       1024
    addi.d        t5,       t5,       1024
.rept 8
    // Even rows 0..14 (256 bytes apart) ...
    vld_x8 t2, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    addi.d        t4,       t2,       1024
    addi.d        t4,       t4,       1024

    // ... and even rows 16..30, 2048 bytes further on.
    vld_x8 t4, 0, 256, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    inv_dct16_lsx no_rect2

    // Park the 16-point result in scratch for dct_8x32_core_lsx.
    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    // Odd rows 1..15 ...
    addi.d        t4,       t2,       128
    vld_x8 t4, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    addi.d        t4,       t4,       1024
    addi.d        t4,       t4,       1024

    // ... and odd rows 17..31.
    vld_x8 t4, 0, 256, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x32_core_lsx t5, t3, 0, 128, 16, -2048, 1024, -1024, 0, 128, , 4

    // Next group of eight columns.
    addi.d        t2,       t2,       16
    addi.d        t5,       t5,       16
    // NOTE(review): t1 is not read anywhere else in the code visible here;
    // this may be a leftover unless dct_8x32_core_lsx uses t1 - confirm.
    addi.d        t1,       t1,       16
.endr
    // ---- Add the 32 finished rows to the destination ----
    // Each row is 64 residuals = 128 bytes of the buffer at sp+64.
    addi.d        t2,       sp,       64
    li.w          t3,       32
.loop32:
    vld           vr10,     a0,       0
    vld           vr11,     a0,       16
    vld           vr12,     a0,       32
    vld           vr13,     a0,       48
    vld_x8 t2, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    DST_ADD_W64 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    add.d         a0,       a0,       a1
    addi.d        t2,       t2,       128
    addi.d        t3,       t3,       -1
    blt           zero,     t3,       .loop32

    // Release the scratch area; the size matches the malloc_space above.
    free_space  64*32*2+512+512
    // The DC-only path joins here, after the release it must not execute.
.DCT_DCT_64X32_END:
endfunc

// VLD_DST_ADD_W8_H32: add two consecutive 64-byte chunks of residuals
// (read through t3) to the destination with VLD_DST_ADD_W8, then point t3
// at the next source area.
//   in0   stack offset; on exit t3 = sp + in0
// Uses vr4..vr7 as residual registers; a0 and t2 are advanced after each
// VLD_DST_ADD_W8 call.
// NOTE(review): a0 moves by a single stride here while t2 moves by four,
// which only lines up if VLD_DST_ADD_W8 leaves a0 on the last row it wrote -
// confirm against that macro, which is not part of this excerpt.
.macro VLD_DST_ADD_W8_H32 in0
    vld           vr4,      t3,       0
    vld           vr5,      t3,       16
    vld           vr6,      t3,       32
    vld           vr7,      t3,       48
    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
    // Second chunk follows directly behind the first.
    addi.d        t3,       t3,       64
    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       t2,     2    // t2 += 4 * a1
    vld           vr4,      t3,       0
    vld           vr5,      t3,       16
    vld           vr6,      t3,       32
    vld           vr7,      t3,       48
    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
    // The caller picks where the next invocation reads from.
    addi.d        t3,       sp,       \in0
    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       t2,     2    // t2 += 4 * a1
.endm

// 8x32 (8 pixels wide, 32 rows) inverse DCT/DCT with add, 8 bpc.
// Register use as visible in this function:
//   a0  destination pixels
//   a1  byte offset from one destination row to the next
//   a2  16-bit coefficients, rows 64 bytes apart (read, then cleared)
//   a3  zero selects the DC-only path; otherwise it is compared against the
//       eob_8x32 thresholds to decide how much of the first pass to run
function inv_txfm_add_dct_dct_8x32_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_8x32

    // DC-only: the same residual (vr20) is added everywhere.  The first
    // DST_ADD_W8 works on the rows idct_dc left in vr10..vr13, the seven
    // repetitions below reload the destination themselves.
    idct_dc 8, 32, 2

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
.rept 7
    // NOTE(review): a0 moves by a single stride per repetition; presumably
    // DST_ADD_W8 / VLD_DST_ADD_W8 leave a0 on the last row they wrote -
    // confirm against those macros.
    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,     1    // t2 = a0 + 2 * a1

    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20
.endr
    // Nothing was allocated on this path, so skip the free_space epilogue.
    b             .DCT_DCT_8X32_END
.NO_HAS_DCONLY_8x32:
    // 512 bytes = 32 rows of eight 16-bit values, addressed from sp+64.
    malloc_space 512

    // ---- First pass: 8-point DCT over blocks of eight rows ----
    la.local      t8,       eob_8x32
    addi.d        t3,       sp,       64
    // NOTE(review): t2 is overwritten after the loop without a visible read
    // in between; possibly a leftover unless a nested macro uses it.
    addi.d        t2,       a2,       0
.DCT_DCT_EOB_8x32:
    ld.h          t7,       t8,       0
    addi.d        t8,       t8,       2

    vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h

    // Intermediate rounding shift.
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsrari.h      \i,       \i,       2
.endr

    // Clear the coefficients that were just consumed.
    vxor.v        vr31,     vr31,     vr31
    vst_x8 a2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    // Next block: 16 bytes further in the input, 128 bytes in the buffer.
    // Repeat while a3 is at least the threshold loaded above.
    addi.d        a2,       a2,       16
    addi.d        t3,       t3,       128
    bge           a3,       t7,       .DCT_DCT_EOB_8x32

    // Zero the 128-byte blocks of the buffer that the loop did not write,
    // from the last block downwards.
    la.local      t8,       eob_8x32
    vxor.v        vr31,     vr31,     vr31
    ld.h          t7,       t8,       4
    bge           a3,       t7,       .DCT_DCT_EOB_8x32_END
    vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    ld.h          t7,       t8,       2
    bge           a3,       t7,       .DCT_DCT_EOB_8x32_END
    vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    ld.h          t7,       t8,       0
    bge           a3,       t7,       .DCT_DCT_EOB_8x32_END
    vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
.DCT_DCT_EOB_8x32_END:
    // ---- Second pass: 32-point DCT down the columns ----
    addi.d        t2,       sp,       64
    addi.d        t3,       sp,       64

    // Even rows (32 bytes apart) feed the 16-point part ...
    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    inv_dct16_lsx .8h

    // ... whose result is written back over the slots it was read from.
    vst_x16 t3, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    // Odd rows start 16 bytes in.
    vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x32_core_lsx t2, t3, 0, 256, 32, 0, 128, 256, 384, 16, , 4

    // ---- Add to the destination, eight rows per macro call ----
    alsl.d        t2,       a1,       a0,     1    // t2 = a0 + 2 * a1
    addi.d        t3,       sp,       64

    // The argument is the stack offset the NEXT call reads from, so the
    // buffer is consumed in the order sp+64, sp+320, sp+448, sp+192.
    // NOTE(review): this order presumably mirrors where dct_8x32_core_lsx
    // placed each group of eight rows - confirm against that macro.
    // The offset given to the last call is never used.
    VLD_DST_ADD_W8_H32 320
    VLD_DST_ADD_W8_H32 448
    VLD_DST_ADD_W8_H32 192
    VLD_DST_ADD_W8_H32 0

    // Release the scratch area; the size matches the malloc_space above.
    free_space 512
    // The DC-only path joins here, after the release it must not execute.
.DCT_DCT_8X32_END:
endfunc

// 8x32 identity/identity inverse transform with add, 8 bpc.
// Register use as visible in this function:
//   a0  destination pixels
//   a1  byte offset from one destination row to the next
//   a2  16-bit coefficients, rows 64 bytes apart (read, then cleared)
//   a3  compared against the eob_8x32 thresholds to decide how many 8x8
//       blocks are processed
// No stack scratch is needed: each 8x8 block goes straight from the
// coefficient buffer to the destination.
function inv_txfm_add_identity_identity_8x32_8bpc_lsx
    la.local      t7,       eob_8x32
    alsl.d        t2,       a1,       a0,     1    // t2 = a0 + 2 * a1

.IDENTITY_IDENTITY_EOB_8x32:
    ld.h          t6,       t7,       0
    addi.d        t7,       t7,       2
    vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    // Clear the coefficients that were just loaded.  vr23 is reused as a
    // transpose output below, hence it is zeroed again on every iteration.
    vxor.v        vr23,     vr23,     vr23
    vst_x8 a2, 0, 64, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23

    // Rounding shift before the transpose ...
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsrari.h       \i,       \i,       1
.endr

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                   vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
                   vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    // ... and a second rounding shift after it.
.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vsrari.h       \i,       \i,       2
.endr
    // Add the eight result vectors to the destination in two calls.
    // NOTE(review): a0 moves by a single stride after each call; presumably
    // VLD_DST_ADD_W8 leaves a0 on the last row it wrote - confirm against
    // that macro, which is not part of this excerpt.
    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,     1

    // Next block of input; repeat while a3 is at least the threshold
    // loaded at the top of this iteration.
    addi.d        a2,       a2,       16
    bge           a3,       t6,       .IDENTITY_IDENTITY_EOB_8x32
endfunc

// def_fn_16x4_base: emit the shared worker for the 16x4 inverse transforms.
//   txfm   identity_  -> inv_txfm_identity_add_16x4_lsx, which performs the
//                        identity first pass inline (multiplier 1697)
//          (empty)    -> inv_txfm_add_16x4_lsx, which calls the first-pass
//                        routine whose address the caller placed in t7
// Both variants call the second-pass routine through t8, once per half of
// the block.  ra is parked in t6 around every indirect call.
// Inputs: a0 destination, a1 byte offset between destination rows,
// a2 coefficients (128 bytes, cleared once they have been consumed).
.macro def_fn_16x4_base txfm
functionl inv_txfm_\txfm\()add_16x4_lsx
    // 128 bytes of coefficients go into the even-numbered registers.
    vld_x8 a2, 0, 16, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14

.ifc \txfm, identity_
    // Inline identity first pass on the eight loaded vectors.
    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0
.irp i, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14
    inv_identity16_lsx \i, vr20, \i, \i, .8h
.endr
.endif

    // Copy the upper 64 bits of each even register into its odd neighbour,
    // giving sixteen registers with four coefficients of interest each.
    vilvh.d       vr1,      vr0,      vr0
    vilvh.d       vr3,      vr2,      vr2
    vilvh.d       vr5,      vr4,      vr4
    vilvh.d       vr7,      vr6,      vr6
    vilvh.d       vr9,      vr8,      vr8
    vilvh.d       vr11,     vr10,     vr10
    vilvh.d       vr13,     vr12,     vr12
    vilvh.d       vr15,     vr14,     vr14

.ifnc \txfm, identity_
    // First pass supplied by the caller through t7.
    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6
.endif

    // The coefficients are consumed: clear them in memory.
    vxor.v        vr23,     vr23,     vr23
    vst_x8 a2, 0, 16, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23

    // Transpose each half: vr0..vr7 -> vr0..vr3, vr8..vr15 -> vr4..vr7.
    LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \
                       vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21

    LSX_TRANSPOSE8x4_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, vr4, \
                       vr5, vr6, vr7, vr16, vr17, vr18, vr19, vr20, vr21

    // First half: intermediate rounding, then the second pass via t8.
.irp i, vr0, vr1, vr2, vr3
    vsrari.h      \i,       \i,       1
.endr
    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    // Final rounding of the first half goes to vr8..vr11, which frees
    // vr0..vr3 to take the (rounded) second half from vr4..vr7.
    vsrari.h      vr8,      vr0,      4
    vsrari.h      vr9,      vr1,      4
    vsrari.h      vr10,     vr2,      4
    vsrari.h      vr11,     vr3,      4
    vsrari.h      vr0,      vr4,      1
    vsrari.h      vr1,      vr5,      1
    vsrari.h      vr2,      vr6,      1
    vsrari.h      vr3,      vr7,      1

    // Second half through the same second-pass routine.
    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    vsrari.h      vr16,     vr0,      4
    vsrari.h      vr17,     vr1,      4
    vsrari.h      vr18,     vr2,      4
    vsrari.h      vr19,     vr3,      4

    // t2 = a0 + 2 * a1, then add both halves to the destination.
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W16 vr8, vr16, vr9, vr17, vr10, vr18, vr11, vr19
endfuncl
.endm

def_fn_16x4_base identity_
def_fn_16x4_base

// fn_16x4 txfm1, txfm2
// Emits the entry point inv_txfm_add_<txfm1>_<txfm2>_16x4_8bpc_lsx.
//   - For the dct_dct combination only, a3 == 0 selects a shortcut path
//     (idct_dc followed by DST_ADD_W16) that then jumps to the end label.
//     NOTE(review): a3 looks like the eob count - confirm against the caller.
//   - Otherwise the second-pass transform address is placed in t8 (and, for a
//     non-identity first pass, the first-pass transform address in t7) and
//     control tail-branches into the shared 16x4 body, which calls through
//     these registers with jirl.
.macro fn_16x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_16x4_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    // Shortcut path is taken only when a3 is zero.
    bnez          a3,       .NO_HAS_DCONLY_16x4
    idct_dc 16, 4, 1
    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20, \
                vr20, vr20, vr20, vr20
    b             .\txfm1\()_\txfm2\()_16x4_END
.NO_HAS_DCONLY_16x4:
.endif

.ifc \txfm1, identity
    // Identity first pass: only the second-pass callback is needed.
    la.local      t8,       inv_\txfm2\()_8h_x4_lsx
    b             inv_txfm_identity_add_16x4_lsx
.else
    // General case: t7 = first-pass callback, t8 = second-pass callback.
    la.local      t7,       inv_\txfm1\()_4h_x16_lsx
    la.local      t8,       inv_\txfm2\()_8h_x4_lsx
    b             inv_txfm_add_16x4_lsx
.endif
.\txfm1\()_\txfm2\()_16x4_END:
endfunc
.endm

// 16x4 entry points generated for the dct_dct, identity_identity and adst_dct
// transform combinations.
fn_16x4 dct, dct
fn_16x4 identity, identity
fn_16x4 adst, dct

// VLD_DST_ADD_W16_H32 in0
// Handles eight rows in two groups of four. For each group it loads four
// 16-byte vectors from t3 and four from t5, applies a rounding arithmetic
// right shift by 4 to every halfword, and hands the row pairs (t3 vector,
// t5 vector) to VLD_DST_ADD_W16.
// NOTE(review): VLD_DST_ADD_W16 is defined elsewhere; presumably it adds the
// values to four destination rows addressed through a0/t2/a1 - confirm.
// On exit:
//   a0, t2 advanced by 8 * a1 in total (4 * a1 after each group)
//   t3 = sp + in0, t5 = sp + in0 + 512 (source pointers for the next call)
// Clobbers vr14-vr21 plus whatever VLD_DST_ADD_W16 clobbers.
.macro VLD_DST_ADD_W16_H32 in0
    // Group 1: four vectors from each of the two source pointers.
    vld           vr14,     t3,       0
    vld           vr15,     t3,       16
    vld           vr16,     t3,       32
    vld           vr17,     t3,       48
    vld           vr18,     t5,       0
    vld           vr19,     t5,       16
    vld           vr20,     t5,       32
    vld           vr21,     t5,       48
    // In-place rounding shift by 4 (inputs and outputs are the same registers).
    vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \
                vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4
    VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21
    // a0 += 4 * a1, t2 += 4 * a1; source pointers move on by 64 bytes.
    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       t2,    2
    addi.d        t3,       t3,       64
    addi.d        t5,       t5,       64
    // Group 2: same sequence on the following 64 bytes of each source.
    vld           vr14,     t3,       0
    vld           vr15,     t3,       16
    vld           vr16,     t3,       32
    vld           vr17,     t3,       48
    vld           vr18,     t5,       0
    vld           vr19,     t5,       16
    vld           vr20,     t5,       32
    vld           vr21,     t5,       48
    vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \
                vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4
    VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21
    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       t2,    2
    // Re-aim the source pointers at the stack offset chosen by the caller.
    addi.d        t3,       sp,       \in0
    addi.d        t5,       sp,       \in0+512
.endm

// inv_txfm_add_dct_dct_16x32_8bpc_lsx
// 16x32 DCT/DCT inverse transform with add-to-destination, LSX version.
// Registers as used by this code:
//   a0 = destination pointer (advanced as rows are written)
//   a1 = destination stride (added to a0/t2 via alsl.d)
//   a2 = coefficient buffer (read, then cleared to zero)
//   a3 = value compared against the eob_16x32 thresholds; a3 == 0 selects the
//        shortcut path. NOTE(review): presumably the eob count - confirm.
// The general path works in a 1024-byte scratch area starting at sp + 64:
// the first 512 bytes receive the first eight transposed vectors of every
// slice, the second 512 bytes the other eight.
function inv_txfm_add_dct_dct_16x32_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_16x32

    // Shortcut path: idct_dc, then 8 groups of 4 rows (1 + 7) are updated.
    idct_dc 16, 32, 1

    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
                    vr20, vr20, vr20, vr20, vr20
.rept 7
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
.endr
    b             .DCT_DCT_16x32_END
.NO_HAS_DCONLY_16x32:
    malloc_space 512+512

    addi.d        t3,       sp,       64
    la.local      t8,       eob_16x32

    // First pass. Each iteration consumes one 16-byte-wide slice of the
    // coefficient buffer (16 vectors at a 64-byte pitch), clears it, and
    // stores the transposed result to the scratch area. The loop repeats
    // while a3 >= the current eob_16x32 threshold.
.DCT_DCT_EOB_16x32:
    ld.h          t7,       t8,       0
    addi.d        t8,       t8,       2
    vld_x16 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    // Zero the coefficients that were just loaded.
    vxor.v        vr31,     vr31,     vr31
.irp i, 0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960
    vst           vr31,     a2,       \i
.endr

    // Scale every input with rect2_lsx using the constant 2896.
    li.w          t0,       2896
    vreplgr2vr.w  vr23,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
     vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    rect2_lsx   \i, vr23, \i
.endr

    inv_dct16_lsx .8h

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    // Intermediate rounding shift by 1 between the two passes.
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vsrari.h       \i,       \i,       1
.endr

    vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vst_x8 t3, 512, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    addi.d        a2,       a2,       16
    addi.d        t3,       t3,       128
    bge           a3,       t7,       .DCT_DCT_EOB_16x32

    // Zero-fill the scratch slices that the loop above did not produce.
    // The thresholds are checked from the highest slice downwards, so the
    // code falls through and clears progressively lower slices.
    la.local      t8,       eob_16x32
    vxor.v        vr31,     vr31,     vr31

    ld.h          t7,       t8,       4
    bge           a3,       t7,       .DCT_DCT_EOB_16x32_END
    vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    vst_x8 sp, 64+896, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    ld.h          t7,       t8,       2
    bge           a3,       t7,       .DCT_DCT_EOB_16x32_END
    vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    vst_x8 sp, 64+768, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

    ld.h          t7,       t8,       0
    bge           a3,       t7,       .DCT_DCT_EOB_16x32_END
    vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
    vst_x8 sp, 64+512+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31

.DCT_DCT_EOB_16x32_END:
    // Second pass, once per 512-byte half of the scratch area: the vectors at
    // even 16-byte positions go through inv_dct16_lsx and are written back,
    // then the vectors at odd positions are loaded for dct_8x32_core_lsx.
    addi.d      t7,   sp,    64
.rept 2
    vld_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    inv_dct16_lsx .8h

    vst_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    vld_x16 t7, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x32_core_lsx t7, t7, 0, 256, 32, 0, 128, 256, 384, 16, ,

    addi.d        t7,       t7,       512
.endr
    // Add the result to the destination, eight rows per macro call. The
    // macro argument is the scratch offset used by the following call.
    alsl.d        t2,       a1,       a0,    1
    addi.d        t3,       sp,       64
    addi.d        t5,       sp,       512+64

    VLD_DST_ADD_W16_H32 320
    VLD_DST_ADD_W16_H32 448
    VLD_DST_ADD_W16_H32 192
    VLD_DST_ADD_W16_H32 0

    free_space 512+512
.DCT_DCT_16x32_END:
endfunc

// xvmulev_xvmaddod_lasx in0, in1, in2, in3, out0, out1
// Widening two-term dot product on 16-bit lanes, producing 32-bit lanes:
//   out0 = in0 * in2 + in1 * in3 for the even-indexed halfwords
//   out1 = in0 * in2 + in1 * in3 for the odd-indexed halfwords
// out0 is written before in0/in2 are read for out1, and both outputs are
// written before in1/in3 are read, so the outputs must not alias any input.
.macro xvmulev_xvmaddod_lasx in0, in1, in2, in3, out0, out1
    xvmulwev.w.h   \out0,    \in0,     \in2
    xvmulwod.w.h   \out1,    \in0,     \in2
    xvmaddwev.w.h  \out0,    \in1,     \in3
    xvmaddwod.w.h  \out1,    \in1,     \in3
.endm

// xvsrari_h_x16 in0..in15, out0..out15, shift
// Rounding arithmetic right shift of every halfword by the immediate shift,
// applied to sixteen registers: out<k> = in<k> >> shift for k = 0..15.
// The instructions are emitted strictly in index order, so an output may
// reuse a register that appears as an input only if that input has already
// been consumed (its index is not higher than the index of the output).
.macro xvsrari_h_x16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                     in11, in12, in13, in14, in15, out0, out1, out2, out3, \
                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
                     out13, out14, out15, shift
    xvsrari.h  \out0,       \in0,     \shift
    xvsrari.h  \out1,       \in1,     \shift
    xvsrari.h  \out2,       \in2,     \shift
    xvsrari.h  \out3,       \in3,     \shift
    xvsrari.h  \out4,       \in4,     \shift
    xvsrari.h  \out5,       \in5,     \shift
    xvsrari.h  \out6,       \in6,     \shift
    xvsrari.h  \out7,       \in7,     \shift
    xvsrari.h  \out8,       \in8,     \shift
    xvsrari.h  \out9,       \in9,     \shift
    xvsrari.h  \out10,      \in10,    \shift
    xvsrari.h  \out11,      \in11,    \shift
    xvsrari.h  \out12,      \in12,    \shift
    xvsrari.h  \out13,      \in13,    \shift
    xvsrari.h  \out14,      \in14,    \shift
    xvsrari.h  \out15,      \in15,    \shift
.endm

// xvpermi_q_x2 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1
// Regroups the 128-bit halves of two register pairs (in0, in2) and
// (in1, in3) with xvpermi.q.
// in0/in1 are first copied to tmp0/tmp1 because out0/out1 are overwritten
// before the second pair of permutes needs the original in0/in1 contents.
// xvpermi.q also reads its destination register, so the previous contents of
// out0..out3 take part in the result; every caller visible in this file
// passes out0..out3 identical to in0..in3.
// NOTE(review): by my reading of xvpermi.q, selector 0x02 yields
// (low = old out low, high = source low) and 0x31 yields
// (low = source high, high = old out high) - confirm against the ISA manual.
.macro xvpermi_q_x2 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1
    xvor.v      \tmp0,      \in0,     \in0
    xvor.v      \tmp1,      \in1,     \in1
    xvpermi.q   \out0,      \in2,     0x02
    xvpermi.q   \out1,      \in3,     0x02
    xvpermi.q   \out2,      \tmp0,    0x31
    xvpermi.q   \out3,      \tmp1,    0x31
.endm

// DST_ADD_W16_LASX in0, in1, in2, in3, in4, in5, in6, in7
// Adds four rows of sixteen 16-bit values to four rows of sixteen bytes and
// stores the saturated byte results.
//   in0..in3 : registers holding the four byte rows (zero-extended here)
//   in4..in7 : the four rows of 16-bit addends
// Stores go to a0, a0 + a1, t2 and t2 + a1 (16 bytes each).
// Clobbers xr0-xr3.
.macro DST_ADD_W16_LASX in0, in1, in2, in3, in4, in5, in6, in7
    // Zero-extend the bytes to halfwords.
    vext2xv.hu.bu xr0,      \in0
    vext2xv.hu.bu xr1,      \in1
    vext2xv.hu.bu xr2,      \in2
    vext2xv.hu.bu xr3,      \in3
    xvadd.h       xr0,      xr0,      \in4
    xvadd.h       xr1,      xr1,      \in5
    xvadd.h       xr2,      xr2,      \in6
    xvadd.h       xr3,      xr3,      \in7
    // Narrow to unsigned bytes with saturation (shift amount 0); two rows
    // end up interleaved per 128-bit lane in xr1 and in xr3.
    xvssrani.bu.h xr1,      xr0,      0
    xvssrani.bu.h xr3,      xr2,      0
    // Reorder the 64-bit elements so that each row occupies the low 128 bits
    // of its own register, which is what the vst/vstx below write out.
    xvpermi.d     xr0,      xr1,      0b11011000
    xvpermi.d     xr2,      xr3,      0b11011000
    xvpermi.d     xr1,      xr0,      0b00001110
    xvpermi.d     xr3,      xr2,      0b00001110
    vst           vr0,      a0,       0
    vstx          vr1,      a0,       a1
    vst           vr2,      t2,       0
    vstx          vr3,      t2,       a1
.endm

// XVLD_DST_ADD_W16 in0, in1, in2, in3
// Loads four 16-byte rows from a0, a0 + a1, t2 and t2 + a1 into vr0-vr3 and
// passes them to DST_ADD_W16_LASX together with the four rows of 16-bit
// addends in0..in3, which writes the sums back to the same addresses.
// Because xr0-xr3 are overwritten by the loads, in0..in3 must not be any of
// xr0-xr3.
.macro XVLD_DST_ADD_W16 in0, in1, in2, in3
    vld           vr0,      a0,       0
    vldx          vr1,      a0,       a1
    vld           vr2,      t2,       0
    vldx          vr3,      t2,       a1
    DST_ADD_W16_LASX xr0, xr1, xr2, xr3, \in0, \in1, \in2, \in3
.endm

// inv_adst16_lasx
// 16-point inverse ADST on 16-bit lanes, LASX (256-bit) version.
// In:   the sixteen input vectors in xr0..xr15.
// Out:  out[0]  = xr14   out[1]  = xr18   out[2]  = xr2    out[3]  = xr5
//       out[4]  = xr7    out[5]  = xr4    out[6]  = xr8    out[7]  = xr10
//       out[8]  = xr1    out[9]  = xr0    out[10] = xr19   out[11] = xr17
//       out[12] = xr3    out[13] = xr9    out[14] = xr13   out[15] = xr15
// Clobbers: t0 and xr0..xr22.
// Every rotation below follows the same pattern: two widening dot products
// (the second with one coefficient negated), re-interleave of the even/odd
// 32-bit results, then a saturating rounding narrow by 12 bits.
.macro inv_adst16_lasx
    la.local      t0,       iadst16_coeffs_h

    // Stage 1: eight input rotations with the iadst16 coefficients.
    xvldrepl.h    xr20,     t0,       0        // 4091
    xvldrepl.h    xr21,     t0,       2        // 201
    xvmulev_xvmaddod_lasx xr15, xr0, xr20, xr21, xr16, xr18
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr15, xr0, xr21, xr20, xr17, xr19
    xvilvl.w      xr15,     xr18,     xr16
    xvilvl.w      xr0,      xr19,     xr17
    xvilvh.w      xr18,     xr18,     xr16
    xvilvh.w      xr19,     xr19,     xr17
    xvssrarni.h.w xr18,     xr15,     12       // t0
    xvssrarni.h.w xr19,     xr0,      12       // t1

    xvldrepl.h    xr20,     t0,       4        // 3973
    xvldrepl.h    xr21,     t0,       6        // 995
    xvmulev_xvmaddod_lasx xr13, xr2, xr20, xr21, xr16, xr0
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr13, xr2, xr21, xr20, xr17, xr15
    xvilvl.w      xr13,     xr0,      xr16
    xvilvl.w      xr2,      xr15,     xr17
    xvilvh.w      xr0,      xr0,      xr16
    xvilvh.w      xr15,     xr15,     xr17
    xvssrarni.h.w xr0,      xr13,     12       // t2
    xvssrarni.h.w xr15,     xr2,      12       // t3

    xvldrepl.h    xr20,     t0,        8       // 3703
    xvldrepl.h    xr21,     t0,        10      // 1751
    xvmulev_xvmaddod_lasx xr11, xr4, xr20, xr21, xr16, xr2
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr11, xr4, xr21, xr20, xr17, xr13
    xvilvl.w      xr11,     xr2,       xr16
    xvilvl.w      xr4,      xr13,      xr17
    xvilvh.w      xr2,      xr2,       xr16
    xvilvh.w      xr13,     xr13,      xr17
    xvssrarni.h.w xr2,      xr11,      12       // t4
    xvssrarni.h.w xr13,     xr4,       12       // t5

    xvldrepl.h    xr20,     t0,        12       // 3290 -> 1645
    xvldrepl.h    xr21,     t0,        14       // 2440 -> 1220
    xvmulev_xvmaddod_lasx xr9, xr6, xr20, xr21, xr16, xr4
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr9, xr6, xr21, xr20, xr17, xr11
    xvilvl.w      xr9,      xr4,       xr16
    xvilvl.w      xr6,      xr11,      xr17
    xvilvh.w      xr4,      xr4,       xr16
    xvilvh.w      xr11,     xr11,      xr17
    xvssrarni.h.w xr4,      xr9,       12       // t6
    xvssrarni.h.w xr11,     xr6,       12       // t7

    xvldrepl.h    xr20,     t0,        16       // 2751
    xvldrepl.h    xr21,     t0,        18       // 3035
    xvmulev_xvmaddod_lasx xr7, xr8, xr20, xr21, xr16, xr6
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr7, xr8, xr21, xr20, xr17, xr9
    xvilvl.w      xr7,      xr6,       xr16
    xvilvl.w      xr8,      xr9,       xr17
    xvilvh.w      xr6,      xr6,       xr16
    xvilvh.w      xr9,      xr9,       xr17
    xvssrarni.h.w xr6,      xr7,       12       // t8
    xvssrarni.h.w xr9,      xr8,       12       // t9

    xvldrepl.h    xr20,     t0,        20       // 2106
    xvldrepl.h    xr21,     t0,        22       // 3513
    xvmulev_xvmaddod_lasx xr5, xr10, xr20, xr21, xr16, xr7
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr5, xr10, xr21, xr20, xr17, xr8
    xvilvl.w      xr5,      xr7,       xr16
    xvilvl.w      xr10,     xr8,       xr17
    xvilvh.w      xr7,      xr7,       xr16
    xvilvh.w      xr8,      xr8,       xr17
    xvssrarni.h.w xr7,      xr5,       12       // t10
    xvssrarni.h.w xr8,      xr10,      12       // t11

    xvldrepl.h    xr20,     t0,        24       // 1380
    xvldrepl.h    xr21,     t0,        26       // 3857
    xvmulev_xvmaddod_lasx xr3, xr12, xr20, xr21, xr16, xr5
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr3, xr12, xr21, xr20, xr17, xr10
    xvilvl.w      xr3,      xr5,       xr16
    xvilvl.w      xr12,     xr10,      xr17
    xvilvh.w      xr5,      xr5,       xr16
    xvilvh.w      xr10,     xr10,      xr17
    xvssrarni.h.w xr5,      xr3,       12       // t12
    xvssrarni.h.w xr10,     xr12,      12       // t13

    xvldrepl.h    xr20,     t0,        28       // 601
    xvldrepl.h    xr21,     t0,        30       // 4052
    xvmulev_xvmaddod_lasx xr1, xr14, xr20, xr21, xr16, xr3
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr1, xr14, xr21, xr20, xr17, xr12
    xvilvl.w      xr1,      xr3,       xr16
    xvilvl.w      xr14,     xr12,      xr17
    xvilvh.w      xr3,      xr3,       xr16
    xvilvh.w      xr12,     xr12,      xr17
    xvssrarni.h.w xr3,      xr1,       12       // t14
    xvssrarni.h.w xr12,     xr14,      12       // t15

    // Stage 2: saturating butterflies t<k> +/- t<k+8>.
    xvsadd.h      xr1,      xr18,      xr6      // t0a
    xvssub.h      xr14,     xr18,      xr6      // t8a
    xvsadd.h      xr16,     xr19,      xr9      // t1a
    xvssub.h      xr17,     xr19,      xr9      // t9a
    xvsadd.h      xr6,      xr0,       xr7      // t2a
    xvssub.h      xr18,     xr0,       xr7      // t10a
    xvsadd.h      xr9,      xr15,      xr8      // t3a
    xvssub.h      xr19,     xr15,      xr8      // t11a
    xvsadd.h      xr0,      xr2,       xr5      // t4a
    xvssub.h      xr7,      xr2,       xr5      // t12a
    xvsadd.h      xr8,      xr13,      xr10     // t5a
    xvssub.h      xr15,     xr13,      xr10     // t13a
    xvsadd.h      xr2,      xr4,       xr3      // t6a
    xvssub.h      xr5,      xr4,       xr3      // t14a
    xvsadd.h      xr10,     xr11,      xr12     // t7a
    xvssub.h      xr13,     xr11,      xr12     // t15a

    // t0 keeps pointing at idct_coeffs_h for the rest of the macro.
    la.local      t0,       idct_coeffs_h

    // Stage 3: rotations of t8a..t15a.
    xvldrepl.h    xr20,     t0,        8        // 799
    xvldrepl.h    xr21,     t0,        10       // 4017
    xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr3, xr11
    xvneg.h       xr21,     xr21
    xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr4, xr12
    xvilvl.w      xr14,     xr11,      xr3
    xvilvl.w      xr17,     xr12,      xr4
    xvilvh.w      xr11,     xr11,      xr3
    xvilvh.w      xr12,     xr12,      xr4
    xvssrarni.h.w xr11,     xr14,      12       // t8
    xvssrarni.h.w xr12,     xr17,      12       // t9

    xvneg.h       xr21,     xr21
    xvmulev_xvmaddod_lasx xr15, xr7, xr20, xr21, xr3, xr14
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr15, xr7, xr21, xr20, xr4, xr17
    xvilvl.w      xr15,     xr14,      xr3
    xvilvl.w      xr7,      xr17,      xr4
    xvilvh.w      xr14,     xr14,      xr3
    xvilvh.w      xr17,     xr17,      xr4
    xvssrarni.h.w xr14,     xr15,      12       // t13
    xvssrarni.h.w xr17,     xr7,       12       // t12

    xvldrepl.h    xr20,     t0,        12       // 3406
    xvldrepl.h    xr21,     t0,        14       // 2276
    xvmulev_xvmaddod_lasx xr18, xr19, xr21, xr20, xr3, xr7
    xvneg.h       xr21,     xr21
    xvmulev_xvmaddod_lasx xr18, xr19, xr20, xr21, xr4, xr15
    xvilvl.w      xr18,     xr7,       xr3
    xvilvl.w      xr19,     xr15,      xr4
    xvilvh.w      xr7,      xr7,       xr3
    xvilvh.w      xr15,     xr15,      xr4
    xvssrarni.h.w xr7,      xr18,      12       // t10
    xvssrarni.h.w xr15,     xr19,      12       // t11

    xvneg.h       xr21,     xr21
    xvmulev_xvmaddod_lasx xr13, xr5, xr20, xr21, xr3, xr18
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr13, xr5, xr21, xr20, xr4, xr19
    xvilvl.w      xr13,     xr18,      xr3
    xvilvl.w      xr5,      xr19,      xr4
    xvilvh.w      xr18,     xr18,      xr3
    xvilvh.w      xr19,     xr19,      xr4
    xvssrarni.h.w xr18,     xr13,      12       // t15
    xvssrarni.h.w xr19,     xr5,       12       // t14

    // Stage 4: saturating butterflies t<k> +/- t<k+4>.
    xvsadd.h      xr5,      xr1,       xr0      // t0
    xvssub.h      xr13,     xr1,       xr0      // t4
    xvsadd.h      xr3,      xr16,      xr8      // t1
    xvssub.h      xr4,      xr16,      xr8      // t5
    xvsadd.h      xr0,      xr6,       xr2      // t2
    xvssub.h      xr1,      xr6,       xr2      // t6
    xvsadd.h      xr8,      xr9,       xr10     // t3
    xvssub.h      xr16,     xr9,       xr10     // t7
    xvsadd.h      xr2,      xr11,      xr17     // t8a
    xvssub.h      xr6,      xr11,      xr17     // t12a
    xvsadd.h      xr9,      xr12,      xr14     // t9a
    xvssub.h      xr10,     xr12,      xr14     // t13a
    xvsadd.h      xr11,     xr7,       xr19     // t10a
    xvssub.h      xr17,     xr7,       xr19     // t14a
    xvsadd.h      xr12,     xr15,      xr18     // t11a
    xvssub.h      xr14,     xr15,      xr18     // t15a

    // Stage 5: rotations with 1567/3784.
    xvldrepl.h    xr20,     t0,        4        // 1567
    xvldrepl.h    xr21,     t0,        6        // 3784
    xvmulev_xvmaddod_lasx xr13, xr4, xr21, xr20, xr7, xr18
    xvneg.h       xr21,     xr21
    xvmulev_xvmaddod_lasx xr13, xr4, xr20, xr21, xr15, xr19
    xvilvl.w      xr13,     xr18,      xr7
    xvilvl.w      xr4,      xr19,      xr15
    xvilvh.w      xr18,     xr18,      xr7
    xvilvh.w      xr19,     xr19,      xr15
    xvssrarni.h.w xr18,     xr13,      12       // t4a
    xvssrarni.h.w xr19,     xr4,       12       // t5a

    xvneg.h       xr21,     xr21
    xvmulev_xvmaddod_lasx xr16, xr1, xr20, xr21, xr7, xr4
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr16, xr1, xr21, xr20, xr15, xr13
    xvilvl.w      xr16,     xr4,       xr7
    xvilvl.w      xr1,      xr13,      xr15
    xvilvh.w      xr4,      xr4,       xr7
    xvilvh.w      xr13,     xr13,      xr15
    xvssrarni.h.w xr4,      xr16,      12       // t7a
    xvssrarni.h.w xr13,     xr1,       12       // t6a

    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr6, xr10, xr21, xr20, xr7, xr1
    xvneg.h       xr21,     xr21
    xvmulev_xvmaddod_lasx xr6, xr10, xr20, xr21, xr15, xr16
    xvilvl.w      xr6,      xr1,       xr7
    xvilvl.w      xr10,     xr16,      xr15
    xvilvh.w      xr1,      xr1,       xr7
    xvilvh.w      xr16,     xr16,      xr15
    xvssrarni.h.w xr1,      xr6,       12       // t12
    xvssrarni.h.w xr16,     xr10,      12       // t13

    xvneg.h       xr21,     xr21
    xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr7, xr6
    xvneg.h       xr20,     xr20
    xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr15, xr10
    xvilvl.w      xr14,     xr6,       xr7
    xvilvl.w      xr17,     xr10,      xr15
    xvilvh.w      xr6,      xr6,       xr7
    xvilvh.w      xr10,     xr10,      xr15
    xvssrarni.h.w xr6,      xr14,      12       // t15
    xvssrarni.h.w xr10,     xr17,      12       // t14

    // Stage 6: final butterflies. Negated outputs are widened to 32 bits,
    // negated there and narrowed back with saturation.
    xvsadd.h       xr14,     xr5,       xr0      // out[0]
    xvssub.h       xr17,     xr5,       xr0      // t2a
    xvssub.h       xr7,      xr3,       xr8      // t3a
    xvsadd.h       xr15,     xr3,       xr8      // out[15]
    xvsllwil.w.h   xr22,     xr15,      0
    xvexth.w.h     xr15,     xr15
    xvneg.w        xr22,     xr22
    xvneg.w        xr15,     xr15
    xvssrarni.h.w  xr15,     xr22,      0        // out[15]

    xvsadd.h       xr3,      xr19,      xr4      // out[12]
    xvssub.h       xr8,      xr19,      xr4      // t7
    xvssub.h       xr0,      xr18,      xr13     // t6
    xvsadd.h       xr5,      xr18,      xr13     // out[3]
    xvsllwil.w.h   xr22,     xr5,       0
    xvexth.w.h     xr5,      xr5
    xvneg.w        xr22,     xr22
    xvneg.w        xr5,      xr5
    xvssrarni.h.w  xr5,      xr22,      0        // out[3]

    xvsadd.h       xr13,     xr9,       xr12     // out[14]
    xvssub.h       xr19,     xr9,       xr12     // t11
    xvssub.h       xr4,      xr2,       xr11     // t10
    xvsadd.h       xr18,     xr2,       xr11     // out[1]
    xvsllwil.w.h   xr22,     xr18,      0
    xvexth.w.h     xr18,     xr18
    xvneg.w        xr22,     xr22
    xvneg.w        xr18,     xr18
    xvssrarni.h.w  xr18,     xr22,      0        // out[1]

    xvsadd.h       xr2,      xr1,       xr10     // out[2]
    xvssub.h       xr11,     xr1,       xr10     // t14a
    xvssub.h       xr12,     xr16,      xr6      // t15a
    xvsadd.h       xr9,      xr16,      xr6      // out[13]
    xvsllwil.w.h   xr22,     xr9,       0
    xvexth.w.h     xr9,      xr9
    xvneg.w        xr22,     xr22
    xvneg.w        xr9,      xr9
    xvssrarni.h.w  xr9,      xr22,      0        // out[13]

    // Stage 7: sum/difference pairs scaled by 2896 (xr21 holds -2896).
    // Negated results are rounded at 32 bits before the sign flip.
    xvldrepl.h     xr20,     t0,        0        // 2896
    xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr20, xr6, xr10
    xvneg.h        xr21,     xr20
    xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr21, xr16, xr1
    xvilvl.w       xr17,     xr10,      xr6
    xvilvl.w       xr7,      xr1,       xr16
    xvilvh.w       xr10,     xr10,      xr6
    xvilvh.w       xr1,      xr1,       xr16
    xvssrarni.h.w  xr1,      xr7,       12       // out[8]
    xvsrari.w      xr17,     xr17,      12
    xvsrari.w      xr10,     xr10,      12
    xvneg.w        xr17,     xr17
    xvneg.w        xr10,     xr10
    xvssrarni.h.w  xr10,     xr17,      0        // out[7]

    xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr21, xr16, xr17
    xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr20, xr6, xr7
    xvilvl.w       xr0,      xr17,      xr16
    xvilvl.w       xr8,      xr7,       xr6
    xvilvh.w       xr17,     xr17,      xr16
    xvilvh.w       xr7,      xr7,       xr6
    xvssrarni.h.w  xr7,      xr8,       12       // out[4]
    xvsrari.w      xr0,      xr0,       12
    xvsrari.w      xr17,     xr17,      12
    xvneg.w        xr0,      xr0
    xvneg.w        xr17,     xr17
    xvssrarni.h.w xr17,      xr0,       0        // out[11]

    xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr21, xr16, xr0
    xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr20, xr6, xr8
    xvilvl.w       xr4,      xr0,       xr16
    xvilvl.w       xr19,     xr8,       xr6
    xvilvh.w       xr0,      xr0,       xr16
    xvilvh.w       xr8,      xr8,       xr6
    xvssrarni.h.w  xr8,      xr19,      12       // out[6]
    xvsrari.w      xr4,      xr4,       12
    xvsrari.w      xr0,      xr0,       12
    xvneg.w        xr4,      xr4
    xvneg.w        xr0,      xr0
    xvssrarni.h.w  xr0,      xr4,       0        // out[9]
    xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr20, xr6, xr4
    xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr21, xr16, xr19
    xvilvl.w       xr11,     xr4,       xr6
    xvilvl.w       xr12,     xr19,      xr16
    xvilvh.w       xr4,      xr4,       xr6
    xvilvh.w       xr19,     xr19,      xr16
    xvssrarni.h.w  xr19,     xr12,      12       // out[10]
    xvsrari.w      xr11,     xr11,      12
    xvsrari.w      xr4,      xr4,       12
    xvneg.w        xr11,     xr11
    xvneg.w        xr4,      xr4
    xvssrarni.h.w  xr4,      xr11,      0        // out[5]
.endm

// inv_txfm_add_adst_adst_16x16_8bpc_lasx
// 16x16 ADST/ADST inverse transform with add-to-destination, LASX version.
// Registers as used by this code:
//   a0 = destination pointer (advanced by 4 * a1 between row groups)
//   a1 = destination stride
//   a2 = coefficient buffer: sixteen 32-byte vectors are loaded from it and
//        the same 512 bytes are cleared before the result is written out
// f24-f31 are saved and restored by PUSH_REG/POP_REG because xr24-xr31 are
// used as scratch and transpose outputs.
function inv_txfm_add_adst_adst_16x16_8bpc_lasx
    PUSH_REG
    xvld_x16 a2, 0, 32, xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \
             xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15

    // First pass.
    inv_adst16_lasx

    // Transpose out[0..7] and out[8..15] (listed in output order of
    // inv_adst16_lasx); xr20-xr27 are scratch.
    LASX_TRANSPOSE8x8_H xr14, xr18, xr2, xr5, xr7, xr4, xr8, xr10, \
                        xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \
                        xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27

    LASX_TRANSPOSE8x8_H xr1,  xr0,  xr19, xr17, xr3, xr9, xr13, xr15, \
                        xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \
                        xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27

    // Intermediate rounding shift by 2; results are renumbered to xr0..xr15
    // so that they match the input layout of inv_adst16_lasx.
    xvsrari_h_x16 xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \
                  xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \
                  xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \
                  xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15, 2

    // Exchange 128-bit halves between register k and register k + 8.
    // NOTE(review): this appears to complete the 16x16 transpose started by
    // the two per-lane 8x8 transposes above - confirm.
    xvpermi_q_x2 xr0, xr1, xr8, xr9, xr0, xr1, xr8, xr9, xr20, xr21
    xvpermi_q_x2 xr2, xr3, xr10, xr11, xr2, xr3, xr10, xr11, xr20, xr21
    xvpermi_q_x2 xr4, xr5, xr12, xr13, xr4, xr5, xr12, xr13, xr20, xr21
    xvpermi_q_x2 xr6, xr7, xr14, xr15, xr6, xr7, xr14, xr15, xr20, xr21

    // Second pass.
    inv_adst16_lasx

    // Final rounding shift by 4. out[2], out[8], out[9] and out[12] are moved
    // out of xr2, xr1, xr0 and xr3 (to xr11, xr12, xr16, xr20) because
    // XVLD_DST_ADD_W16 overwrites xr0-xr3.
    xvsrari_h_x16 xr14, xr18, xr2,  xr5,  xr7,  xr4, xr8,  xr10, \
                  xr1,  xr0,  xr19, xr17, xr3,  xr9, xr13, xr15, \
                  xr14, xr18, xr11, xr5,  xr7,  xr4, xr8,  xr10, \
                  xr12, xr16, xr19, xr17, xr20, xr9, xr13, xr15, 4

    // Clear the 512-byte coefficient buffer.
    xvxor.v       xr23,     xr23,     xr23
.irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480
    xvst          xr23,     a2,       \i
.endr
    // Add the sixteen result rows to the destination, four rows per call;
    // t2 always points two rows below a0.
    alsl.d        t2,       a1,       a0,    1
    XVLD_DST_ADD_W16 xr14, xr18, xr11, xr5
    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    XVLD_DST_ADD_W16 xr7, xr4, xr8, xr10
    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    XVLD_DST_ADD_W16 xr12, xr16, xr19, xr17
    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    XVLD_DST_ADD_W16 xr20, xr9, xr13, xr15
    POP_REG
endfunc
